Main analysis

Performance as a function of number of repetitions. Minimum number of repetitions: 5; minimum number of weeks: 5.

Cognition (SDMT)

# Analysis parameters for the cognition test (SDMT, Floodlight test code "ips").
# These drive the whole pipeline below: which test/metric to keep, the
# inclusion thresholds, predictor and axis labels, and the data cut-off date.
params = list(
  test_code = "ips",                         # Floodlight test code for SDMT
  test_metric_code = "correct_responses",    # metric analysed for this test
  unit = "SDMT: Correct Responses",          # y-axis label for plots
  unit_n = "patient",                        # unit of observation in labels
  unit_time = "repetition",                  # time unit used in annotations
  min_repetitions = 5,                       # inclusion: at least 5 repetitions
  min_weeks = 5,                             # inclusion: at least 5 weeks of participation
  predictor = "repetition",                  # column used as model predictor
  xlab = "Repetitions",                      # x-axis label for plots
  bounded.growth.confidence.interval = TRUE, # TRUE/FALSE, not reassignable T/F
  up_to_date = "2021-05-01"                  # analyse only data before this date
)
library(data.table) # fread
library(parsedate) # parse_date
library(dplyr) # group_by
library(tibble) # deframe
library(lme4) # lmer
library(mgcv) # gamm
library(quantreg) # rq
library(patchwork) # plot_layout
library(gridExtra) # grid.arrange
library(ggpubr) # ggscatter
library(ggtext) # geom_text
library(sjPlot) # plot_model

# Download from: https://dataset.floodlightopen.com/public-blobs-prod/complete_dataset.csv
data = fread("complete_dataset.csv", data.table=F)

# Prepare dataset: keep only the selected test and non-control participants
data = data[data$testCode == params$test_code & !data$participantIsControl,]
# Parse the raw test start timestamp into POSIXct
data$time = parse_date(data$testStartedAt)
data = data[data$time <= as.POSIXct(params$up_to_date, tz="UTC"),] # only analyse data up to (excluding) params$up_to_date
data = data[!duplicated(data),] # sometimes contains true duplicates for some reason (even with the same testResultMetricId)

# For "Finger Pinching" hand_used has to be determined: each test contributes
# multiple rows (one per testMetricCode), which are reshaped to one row per test.
if (params$test_code == "pinching") {
  library(tidyr) # pivot_wider
  # leftover diagnostic: distribution of rows per timestamp (value is not
  # auto-printed inside an if block, so this line has no visible effect)
  # just one means either "hand" or "successful_pinches" values are missing, remove those
  table(table(data$time))
  data = data[!data$time %in% names(which(table(data$time)==1)), ]
  # one row per test, one column per metric (including the hand used)
  data = as.data.frame(pivot_wider(data, id_cols=c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time"), names_from="testMetricCode", values_from="testResultMetricValue"))
} else {
  # single-metric tests: keep the selected metric; hand is not applicable
  data = data[data$testMetricCode == params$test_metric_code,]
  data$hand_used = NA
  data = data[c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time", "testResultMetricValue", "hand_used")]
}

# Harmonise column names across the two branches above
colnames(data) = c("id", "control", "sex", "birthyear", "time", "value", "hand_used")
data$age = year(data$time)-data$birthyear # Estimate age (year of test minus birth year)
data = data[order(as.character(data$id)),] # sort so each participant's rows are contiguous

# 0 result values are discarded
data = data[!is.na(data$value) & data$value != 0,]

# Consider those supposedly younger than 18 (minimum study age) and older than 90 as NA
data$age[data$age < 18 | data$age > 90] = NA

# Analysis unit is participant x hand; for single-hand tests hand_used is NA,
# so every id just gets a constant "_handNA" suffix
data$id_original = data$id
data$id = paste0(data$id, "_hand", data$hand_used)

data$day = as.IDate(data$time)
# NOTE: intentionally shadows base::round for the rest of the script -- it
# returns a *string* formatted to `digits` decimals via sprintf, so printed
# summaries keep trailing zeros (e.g. "35.0" instead of 35).
round = function(x, digits=0) sprintf(paste0("%.", digits, "f"), x)

# ggplot theme helpers to hide one axis
no_x = theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
no_y = theme(axis.text.y=element_blank(), axis.ticks.y=element_blank())

linecolor = "#c71138" # accent color used for fits and annotations throughout

Participant selection

# Per participant(-hand): number each test with a 0-based "repetition" index
# and record elapsed weeks since that participant's first test. Rows are in
# file order within each id, so the first row per id is the reference point.
for (current_id in unique(data$id)) {
  rows = data$id == current_id
  n_rows = sum(rows)
  data[rows, "repetition"] = seq_len(n_rows) - 1
  first_time = data[rows, "time"][1]
  data[rows, "weeksSinceFirst"] = as.numeric(difftime(data[rows, "time"], first_time, unit="weeks"))
}

# Cohort sizes before applying inclusion criteria (used for the "x / y" table)
n_orig = nrow(data)
n_patients_orig = length(unique(data$id_original))
n_hands_orig = length(unique(data$id))

# Per id: total follow-up (weeks between first and last row) and final 0-based
# repetition index. last() refers to row order within id -- assumes rows are
# time-ordered within each id; TODO confirm against the source file ordering.
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(weeksSinceFirst), repetitions=last(repetition), .groups="keep")

Among the total n=1095 patients with n=5715 repetitions, the median length of participation is 0.0 weeks (IQR 0.0-8.6, range 0.0-151.6) and the median number of repetitions is 1 (IQR 1-4, range 1-106).

# Apply inclusion criteria: keep ids with >= min_weeks of follow-up AND
# >= min_repetitions tests (repetition is 0-based, hence the +1)
data = data[data$id %in% participation_duration$id[participation_duration$weeks >= params$min_weeks & participation_duration$repetitions+1 >= params$min_repetitions],]

# Inter-test interval per id: days elapsed since the participant's previous
# test (0 for the first test). The lagged time vector is built by prepending
# the first timestamp and dropping the last one.
for (current_id in unique(data$id)) {
  rows = data$id == current_id
  n = sum(rows)
  times = data[rows, "time"]
  # seq_len(n - 1) is empty when n == 1 (the original 1:(n-1) would yield
  # indices c(1, 0) and a length-2 lag vector, breaking the assignment), so
  # single-test ids safely get a 0-day interval. Identical for n >= 2, and
  # after the inclusion filter above n >= min_repetitions always holds.
  lagged = c(times[1], times[seq_len(n - 1)])
  data[rows, "daysSinceLast"] = as.numeric(difftime(times, lagged, unit="days"))
}

# Per-id descriptive statistics after inclusion: sex, mean age, follow-up
# weeks, final repetition index, and median/IQR of the inter-test interval
participation_duration = data %>% group_by(id) %>% summarise(sex=first(sex), mean_age=mean(age), weeks=last(weeksSinceFirst), repetitions=last(repetition), median_intertest_interval=median(daysSinceLast), IQR_intertest_interval=IQR(daysSinceLast), .groups="keep")

# Generic predictor column selected via params (here: the repetition index)
data$predictor = data[,params$predictor]

Inclusion criteria: participation for at least 5 weeks and at least 5 repetitions performed per test, leading to the analysis of n=251 / 1095 patients and n=4388 / 5715 tests. Among those, the median length of participation is 16.5 weeks (IQR 10.1-45.9, range 5.0-151.6) and the median number of repetitions is 11 (IQR 6.5-18, range 5-106).

# Cohort description table after inclusion. All entries are pre-formatted
# strings; `round` is the sprintf wrapper defined above, so trailing zeros
# are preserved (e.g. "35.0").
# NOTE(review): percent_female indexes prop.table(...)[[2]] -- this assumes
# both FALSE and TRUE levels occur in table(sex == "female"); verify that the
# cohort can never be all-female or all-male here.
t(data.frame(
  n_patients = paste0(length(unique(data$id_original)), " / ", n_patients_orig, " (", round(length(unique(data$id_original))/n_patients_orig*100,1), "%)"),
  n_hands = paste0(length(unique(data$id)), " / ", n_hands_orig, " (", round(length(unique(data$id))/n_hands_orig*100,1), "%)"),
  n_tests = paste0(nrow(data), " / ", n_orig, " (", round(nrow(data)/n_orig*100,1), "%)"),
  percent_female = paste0(round(prop.table(table(participation_duration$sex == "female"))[[2]]*100, 1)),
  age = paste0(round(median(participation_duration$mean_age,na.rm=T),1), " (", round(quantile(participation_duration$mean_age, 0.25, na.rm=T),1), "-", round(quantile(participation_duration$mean_age, 0.75, na.rm=T),1), ", range ", round(min(participation_duration$mean_age, na.rm=T),1), "-", round(max(participation_duration$mean_age, na.rm=T),1), ")"),
  repetitions = paste0(median(participation_duration$repetitions)+1, " repetitions (IQR ", quantile(participation_duration$repetitions+1, 0.25), "-", quantile(participation_duration$repetitions+1, 0.75), ", range ", min(participation_duration$repetitions+1), "-", max(participation_duration$repetitions+1), ")"),
  median_intertest_interval = paste0(round(median(participation_duration$median_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$median_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$median_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$median_intertest_interval),1), "-", round(max(participation_duration$median_intertest_interval),1), ")"),
  IQR_intertest_interval = paste0(round(median(participation_duration$IQR_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$IQR_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$IQR_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$IQR_intertest_interval),1), "-", round(max(participation_duration$IQR_intertest_interval),1), ")"),
  weeks = paste0(round(median(participation_duration$weeks),1), " weeks (IQR ", round(quantile(participation_duration$weeks, 0.25),1), "-", round(quantile(participation_duration$weeks, 0.75),1), ", range ", round(min(participation_duration$weeks),1), "-", round(max(participation_duration$weeks),1), ")")
))
##                           [,1]                                         
## n_patients                "251 / 1095 (22.9%)"                         
## n_hands                   "251 / 1095 (22.9%)"                         
## n_tests                   "4388 / 5715 (76.8%)"                        
## percent_female            "70.5"                                       
## age                       "50.1 (42.0-58.0, range 20.0-79.0)"          
## repetitions               "11 repetitions (IQR 6.5-18, range 5-106)"   
## median_intertest_interval "7.6 days (IQR 7.1-9.5, range 6.7-87.1)"     
## IQR_intertest_interval    "2.8 days (IQR 0.7-8.1, range 0.0-133.8)"    
## weeks                     "16.5 weeks (IQR 10.1-45.9, range 5.0-151.6)"

Summary level analysis

Difference test

# One row per id: first/last/mean score, follow-up, number of tests, ages,
# and the first-to-last score difference used as outcome below.
# NOTE: `repetition` here is the test count n(), not the 0-based index used
# on the row level; `predictor` below therefore holds the count per id.
df = as.data.frame(data %>% group_by(id) %>% summarise(first=first(value), last=last(value), mean=mean(value), weeksSinceFirst=max(weeksSinceFirst), repetition=n(), first_age=first(age), last_age=last(age), mean_age=mean(age), .groups="keep") %>% mutate(diff=last-first))

df$predictor = df[, params$predictor]

# Paired t-test: last vs first score per id
test = t.test(df$last, df$first, paired=T)
test
## 
##  Paired t-test
## 
## data:  df$last and df$first
## t = 21.067, df = 250, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   9.086812 10.960997
## sample estimates:
## mean of the differences 
##                 10.0239
# Baseline model: does the first-to-last difference depend on age and on the
# first score (regression-to-the-mean check)? Covariates scaled per 10 units.
mod0 = lm(diff ~ I(mean_age/10) + I(first/10), df)
summ0 = summary(mod0)
summ0
## 
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24.4317  -4.4998  -0.0443   4.4295  24.4693 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     24.1237     3.8580   6.253 1.76e-09 ***
## I(mean_age/10)  -0.4513     0.4952  -0.911    0.363    
## I(first/10)     -3.0882     0.4879  -6.329 1.15e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.92 on 247 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1671, Adjusted R-squared:  0.1604 
## F-statistic: 24.78 on 2 and 247 DF,  p-value: 1.555e-10
# Full model: add log10(number of repetitions) as a predictor of improvement
mod = lm(diff ~ I(mean_age/10) + I(first/10) + log10(predictor), df)
confint(mod)
##                      2.5 %     97.5 %
## (Intercept)      13.019407 27.3652977
## I(mean_age/10)   -2.076327 -0.2111537
## I(first/10)      -4.304726 -2.5062707
## log10(predictor)  5.412423 10.3436471
summ = summary(mod)
summ
## 
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10) + log10(predictor), 
##     data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.2859  -3.7000  -0.1442   4.1498  21.8293 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       20.1924     3.6417   5.545 7.58e-08 ***
## I(mean_age/10)    -1.1437     0.4735  -2.416   0.0164 *  
## I(first/10)       -3.4055     0.4565  -7.459 1.49e-12 ***
## log10(predictor)   7.8780     1.2518   6.293 1.41e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.435 on 246 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2826, Adjusted R-squared:  0.2739 
## F-statistic: 32.31 on 3 and 246 DF,  p-value: < 2.2e-16
# additional variance explained by predictor
#print(summ$r.squared - summ0$r.squared)

# Observed improvement relative to the mean baseline score (in %), with the
# paired t-test confidence interval scaled the same way
print(paste0("Average observed improvement over baseline: ", round(test$estimate/mean(df$first)*100, 1), " (", round(test$conf[1]/mean(df$first)*100, 1), "-", round(test$conf[2]/mean(df$first)*100, 1), ")"))
## [1] "Average observed improvement over baseline: 26.1 (23.7-28.6)"
lab.y = 1.1*mean(df$last)

# Bar plot of first/mean/last scores (mean +- SE)
p1 = ggbarplot(data.frame(Timepoint=rep(c("First","Mean","Last"),each=nrow(df)), value=c(df$first,df$mean,df$last)), "Timepoint", "value", add="mean_se", label=T, lab.nb.digits=1, lab.vjust=1.9, ylab=params$unit) + xlab("Score") #+ stat_compare_means(comparisons = list(c("First","Last")), paired=T, method="t.test", label.y=lab.y) + scale_y_continuous(expand=expansion(mult=c(0,0.1)))

# Forest plot of the full model's coefficients.
# NOTE(review): plot_model() is given the summary object (summ), not the
# fitted model (mod) -- sjPlot documents a model input; confirm intended.
p2 = plot_model(summ, show.values=T, vline.color = "grey", show.intercept=T, colors=linecolor, title=paste0("Difference from First to Last Score, R²=", round(summ$r.squared, 2)), axis.labels=rev(c("Intercept", "Age (per 10 years)", "First score (per 10)", paste0(params$xlab, " (log 10)"))), value.offset=0.3, show.p=F) + ylab("β estimates")

(p1 + p2) + plot_layout(widths=c(2,5)) & theme_pubr(base_family="Serif")

Confounders

# Pairwise confounder scatter plots. Every panel shares the same visual style
# (regression line + CI, Pearson correlation coefficient, themed), so a local
# helper avoids repeating the ggscatter boilerplate eleven times.
# axis_log selects the optional log10 axis: "y" when the predictor is on the
# y-axis, "x" when it is on the x-axis (with extra right-hand expansion so
# in-panel labels fit). Construction order (scatter, scale, labs, theme)
# matches the original calls exactly.
conf_scatter = function(xvar, yvar, xlab_text, ylab_text, axis_log = c("none", "x", "y")) {
  axis_log = match.arg(axis_log)
  p = ggscatter(df, xvar, yvar, add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor))
  if (axis_log == "y") p = p + scale_y_log10()
  if (axis_log == "x") p = p + scale_x_log10(expand = expansion(mult = c(.05, .15)))
  p + xlab(xlab_text) + ylab(ylab_text) + theme_pubr(base_family="Serif")
}

pred_lab = paste0(params$xlab, " (log10)")

p_age_first = conf_scatter("mean_age", "first", "Mean age", "First score")

p_age_pred = conf_scatter("mean_age", "predictor", "Mean age", pred_lab, axis_log="y")

p_age_last = conf_scatter("mean_age", "last", "Mean age", "Last score")

p_age_diff = conf_scatter("mean_age", "diff", "Mean age", "Difference first to last score")

p_first_pred = conf_scatter("first", "predictor", "First score", pred_lab, axis_log="y")

p_first_last = conf_scatter("first", "last", "First score", "Last score") #+ geom_abline(intercept=0,slope=1)

p_first_diff = conf_scatter("first", "diff", "First score", "Difference first to last score") #+ geom_abline(intercept=0,slope=1)

p_pred_last = conf_scatter("predictor", "last", pred_lab, "Last score", axis_log="x")

p_pred_diff = conf_scatter("predictor", "diff", pred_lab, "Difference first to last score", axis_log="x")

p_last_diff = conf_scatter("last", "diff", "Last score", "Difference first to last score") #+ geom_abline(intercept=0,slope=1)

p_last_pred = conf_scatter("last", "predictor", "Last score", pred_lab, axis_log="y")


# Marginal distributions for the diagonal of the confounder matrix below
p_age = gghistogram(df, "mean_age", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_first = gghistogram(df, "first", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_last = gghistogram(df, "last", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_pred = gghistogram(df, "predictor", bins=15) + scale_x_log10() + xlab(NULL) + theme_pubr(base_family="Serif")
p_diff = gghistogram(df, "diff", bins=15) + xlab("Difference first to last") + theme_pubr(base_family="Serif")

#(((p1+xlab(NULL)) + (p2+xlab(NULL)+ylab(NULL))) / ((p3+xlab(NULL)) + (p4+xlab(NULL)+ylab(NULL))) / ((p5) | (p6+ylab(NULL)))) & theme_pubr(base_family="Serif")

#(p_age_first | p_first) / (p_age_last | p_first_last | p_last) / (p_age_pred | p_first_pred | p_last_pred | p_pred) / (p_age_diff | p_first_diff | p_last_diff | p_pred_diff)

# Lower-triangular 5x5 layout: histograms on the diagonal, pairwise scatter
# plots below; grobs are listed in the column-major order in which
# lower.tri() numbers the cells (columns: age, first, predictor, last, diff)
m <- matrix(NA, 5, 5)
m[lower.tri(m, diag = T)] <- 1:15
grid.arrange(grobs=list(
  p_age, p_age_first+xlab(NULL), p_age_pred+xlab(NULL), p_age_last+xlab(NULL), p_age_diff,
  p_first, p_first_pred+xlab(NULL)+ylab(""), p_first_last+xlab(NULL)+ylab(""), p_first_diff+ylab(""),
  p_pred, p_pred_last+xlab(NULL)+ylab(""), p_pred_diff+ylab(""),
  p_last, p_last_diff+ylab(""), 
  p_diff
), layout_matrix=m, heights=c(1,1,1,1,1.1))
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

#GGally::ggpairs(df[c("mean_age", "first", "last", "predictor", "diff")])
#pairs(df[c("mean_age", "first", "last", "predictor", "diff")], upper.panel=NULL)
#corrplot::corrplot(cor(df[c("mean_age", "first", "last", "predictor", "diff")], use="complete.obs"))

Learning curve: Model selection

# Candidate model 1: penalised-spline GAMM with a random intercept per id,
# used as a flexible reference shape for the learning curve
smoothing_spline = gamm(value ~ s(predictor, bs="ps"), random=list(id=~1), data=data)
summary(smoothing_spline$lme)
## Linear mixed-effects model fit by maximum likelihood
##  Data: strip.offset(mf) 
##        AIC      BIC   logLik
##   24969.01 25000.94 -12479.5
## 
## Random effects:
##  Formula: ~Xr - 1 | g
##  Structure: pdIdnot
##            Xr1    Xr2    Xr3    Xr4    Xr5    Xr6    Xr7    Xr8
## StdDev: 5.6785 5.6785 5.6785 5.6785 5.6785 5.6785 5.6785 5.6785
## 
##  Formula: ~1 | id %in% g
##         (Intercept) Residual
## StdDev:    9.459975 3.642742
## 
## Fixed effects: y ~ X - 1 
##                     Value Std.Error   DF  t-value p-value
## X(Intercept)     47.92484   0.60361 4136 79.39700       0
## Xs(predictor)Fx1 33.02793   8.11804 4136  4.06846       0
##  Correlation: 
##                  X(Int)
## Xs(predictor)Fx1 0.003 
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -7.78184177 -0.53101332  0.05473965  0.59964708  4.47513183 
## 
## Number of Observations: 4388
## Number of Groups: 
##         g id %in% g 
##         1       251
# GAM view of the same fit: smooth-term significance and adjusted R-squared
summary(smoothing_spline$gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## value ~ s(predictor, bs = "ps")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  47.9248     0.6035   79.41   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##               edf Ref.df     F p-value    
## s(predictor) 8.14   8.14 365.3  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.126   
##   Scale est. = 13.27     n = 4388
# Candidate model 2: linear mixed model (random intercept per id)
REM_linear = lmer(value ~ (1|id) + predictor, data)
# fixed-effects prediction used for plotting the population curve
equ_linear = function(t) fixef(REM_linear)[1] + fixef(REM_linear)[2]*t
summary(REM_linear)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor
##    Data: data
## 
## REML criterion at convergence: 25903.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -7.3068 -0.5274  0.0905  0.6226  4.1418 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 90.20    9.497   
##  Residual             16.83    4.102   
## Number of obs: 4388, groups:  id, 251
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept) 44.088932   0.606428   72.70
## predictor    0.183064   0.004772   38.36
## 
## Correlation of Fixed Effects:
##           (Intr)
## predictor -0.065
# Candidate model 3: quadratic mixed model (random intercept per id)
REM_quadratic = lmer(value ~ (1|id) + predictor + I(predictor^2), data)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
# fixed-effects prediction used for plotting the population curve
equ_quadratic = function(t) fixef(REM_quadratic)[1] + fixef(REM_quadratic)[2]*t + fixef(REM_quadratic)[3]*t^2
summary(REM_quadratic)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor + I(predictor^2)
##    Data: data
## 
## REML criterion at convergence: 25530.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -7.5414 -0.5298  0.0717  0.6076  4.0476 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 89.99    9.486   
##  Residual             15.32    3.915   
## Number of obs: 4388, groups:  id, 251
## 
## Fixed effects:
##                  Estimate Std. Error t value
## (Intercept)    42.9686500  0.6076484   70.71
## predictor       0.3925009  0.0113252   34.66
## I(predictor^2) -0.0030669  0.0001518  -20.20
## 
## Correlation of Fixed Effects:
##             (Intr) prdctr
## predictor   -0.109       
## I(prdctr^2)  0.091 -0.916
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
# Candidate model 4: bounded-growth (asymptotic) nonlinear mixed model with
# random baseline (y0) and asymptote (yf) per id. SSasymp parameterisation:
# value(t) = yf + (y0 - yf) * exp(-exp(log_alpha) * t)
REM_bounded = nlmer(value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0|id) + (yf|id), data = data, start=c(yf=40, y0=20, log_alpha=-1))
y0=fixef(REM_bounded)["y0"] # population baseline (t = 0)
yf=fixef(REM_bounded)["yf"] # population asymptote ("boundary")
log_alpha=fixef(REM_bounded)["log_alpha"] # log of the rate constant
# population curve, with CI-bound parameters overridable for the ribbon below
equ_bounded = function(t, yf=fixef(REM_bounded)[["yf"]], y0=fixef(REM_bounded)[["y0"]], log_alpha=fixef(REM_bounded)[["log_alpha"]]) yf+(y0-yf)*exp(-exp(log_alpha)*t)
summary(REM_bounded)
## Warning in vcov.merMod(object, use.hessian = use.hessian): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
## Warning in vcov.merMod(object, correlation = correlation, sigm = sig): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
## Nonlinear mixed model fit by maximum likelihood  ['nlmerMod']
## Formula: value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0 | id) + (yf |  
##     id)
##    Data: data
## 
##      AIC      BIC   logLik deviance df.resid 
##  25040.5  25078.8 -12514.3  25028.5     4382 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -7.0591 -0.5246  0.0596  0.5963  3.8253 
## 
## Random effects:
##  Groups   Name Variance Std.Dev.
##  id       y0    95.33    9.764  
##  id.1     yf   171.06   13.079  
##  Residual       12.22    3.495  
## Number of obs: 4388, groups:  id, 251
## 
## Fixed effects:
##           Estimate Std. Error t value
## yf        56.74459    1.03625   54.76
## y0        40.65554    0.63254   64.27
## log_alpha -2.59218    0.04834  -53.62
## 
## Correlation of Fixed Effects:
##           yf     y0    
## y0        -0.016       
## log_alpha -0.454 -0.084
# rate constant alpha on the natural scale
exp(log_alpha)
## log_alpha 
## 0.0748565
cat("Average improvement over baseline: ", (yf-y0)/y0*100)
## Average improvement over baseline:  39.57408
# Compare the four candidate models on the same data
RMSE = sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) sqrt(mean(resid(mod)^2))) # RMSE
RMSE
## [1] 3.984912 3.801991 3.535307 3.314229
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) mean(abs(resid(mod)))) # MAE
## [1] 2.977195 2.828032 2.619453 2.497699
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) extractAIC(mod)) # edf & AIC: smoothing_spline$lme always has edf 5
##          [,1]     [,2]     [,3]     [,4]
## [1,]     4.00     5.00     5.00     6.00
## [2,] 25903.69 25516.28 24969.01 25040.51
edf = sapply(list(REM_linear, REM_quadratic, smoothing_spline$gam, REM_bounded), function(mod) { nrow(data)-df.residual(mod) }) # while smoothing_spline$gam often has much higher edf
edf
## [1] 4.000000 5.000000 9.139569 6.000000
# NOTE: `round` is the sprintf wrapper, so RMSE/edf become formatted strings
# (e.g. "3.3") ready for the plot legend below
RMSE = round(RMSE,1)
edf = round(edf,1)

Plot

# Number of ids still contributing data at each predictor value.
# NOTE(review): the column is named `weeks` but holds last(predictor), i.e.
# the final repetition index -- misleading name, but used consistently below.
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(predictor), .groups="keep") %>% deframe()
remaining_participants_bins = seq(0,350,by=10)
remaining_participants = data.frame(x=remaining_participants_bins, text=sapply(remaining_participants_bins, function(x) sum(participation_duration>=x)))

# truncate the x-axis at the first bin where fewer than 10 ids remain
xmax = remaining_participants$x[which(remaining_participants$text<10)[1]]

# 5th-95th percentile of scores, used as y-axis limits
range_90p =quantile(data$value, probs=c(0.05,0.95))

# Learning-curve comparison: individual trajectories (grey), smoothing spline
# (black long-dash), linear (blue dotted), quadratic (green dashed) and
# bounded growth (red solid), plus an RMSE (edf) legend rendered as rich text
p1 = ggplot() +
  geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.1) +
  geom_line(aes(x,y), data.frame(x=0:xmax, y=predict(smoothing_spline$gam, newdata=data.frame(predictor=0:xmax))), linetype="longdash", size=1) + xlim(0,xmax) + ylim(range_90p[1], range_90p[2]) +
  stat_function(fun=equ_linear, color="blue", linetype="dotted", size=1) + 
  stat_function(fun=equ_quadratic, color="green4", linetype="dashed", size=1) + 
  stat_function(fun=equ_bounded, color=linecolor, size=1) + 
  theme_pubr(base_family="Serif") + no_x + xlab(NULL) + ylab(params$unit) +
  geom_richtext(aes(x,y,label=label,hjust=1), data.frame(x=0.8*xmax, y=range_90p[2]-(range_90p[2]-range_90p[1])/1.3, label=paste0("Model RMSE (edf):<br><span style='color:#0000ff'>····· Linear: ", RMSE[1], " (", edf[1], ") </span><br><span style='color:#008b00'>- - - Quadratic: ", RMSE[2], " (", edf[2], ") </span><br><span style='color:#000000'>— — Smoothing spline: ", RMSE[3], " (", edf[3], ") </span><br><span style='color:", linecolor, "'>— Bounded growth: ", RMSE[4], " (", edf[4], ") </span>")), family="Serif")

# strip below the curves: number of ids still contributing per bin
p2 = ggplot(remaining_participants[remaining_participants$x<=xmax,]) +
  geom_text(aes(x=x,y="A",label=text), family="Serif") +
  theme_pubr(base_family="Serif") + no_y + xlab(params$xlab) + ylab(paste0("Remaining \n ", params$unit_n, "s")) +
  scale_x_continuous(breaks=seq(0,xmax,by=10))

(p1 / p2) + plot_layout(heights=c(0.9,0.1))
## Warning: Removed 302 row(s) containing missing values (geom_path).

# Profile-likelihood confidence intervals for the bounded-growth fixed effects
# (can be slow, hence gated behind the params flag)
if (params$bounded.growth.confidence.interval) conf = confint.merMod(REM_bounded, c("y0","yf","log_alpha"), method="profile")
## Computing profile confidence intervals ...
## Warning in vcov.merMod(object, use.hessian = use.hessian): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
## Warning in vcov.merMod(object, correlation = correlation, sigm = sig): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
if (params$bounded.growth.confidence.interval) {
  print(conf)
  # rate constant CI on the natural scale
  print(exp(conf[3,]))
  # CI for the relative boundary improvement, pairing the respective yf/y0 bounds
  print(paste0("Average boundary improvement over baseline: ", round((yf-y0)/y0*100, 1), " (", round((conf[1,1]-conf[2,1])/conf[2,1]*100, 1), "-", round((conf[1,2]-conf[2,2])/conf[2,2]*100, 1), ")"))
}
##               2.5 %    97.5 %
## yf        54.629934 59.030012
## y0        39.403849 41.906512
## log_alpha -2.726682 -2.461826
##      2.5 %     97.5 % 
## 0.06543607 0.08527909 
## [1] "Average boundary improvement over baseline: 39.6 (38.6-40.9)"

Bounded growth model

# Derivative of the bounded-growth curve: slope m(t) = alpha*(yf-y0)*exp(-alpha*t)
equ_diff_REM_bounded = function(t) exp(log_alpha)*(yf-y0)*exp(exp(log_alpha)*-t)
# inverse of the derivative: time at which the slope drops to target_slope
equ_diff_get_time_REM_bounded = function(target_slope) log(exp(log_alpha)*(yf-y0)/target_slope)/exp(log_alpha)

# inverse of the growth curve: time at which target_value is reached
# (solves yf+(y0-yf)*exp(-alpha*t) = target_value for t)
equ_bounded_get_x = function(target_value) exp(-log_alpha)*log((yf-y0)/(yf-target_value))

# Annotate 0% (baseline), 50% (half-practice) and 90% of the y0 -> yf gain
growth_percentiles = c(0, 0.5, 0.9)
names_percentiles = c("baseline", "half-practice point", "90% practice")
selected_timepoints = equ_bounded_get_x(y0+(yf-y0)*growth_percentiles)
example_slopes_bounded = data.frame(
  x=selected_timepoints,
  y=equ_bounded(selected_timepoints),
  label=paste0("y=", round(equ_bounded(selected_timepoints),1), ", m=", signif(equ_diff_REM_bounded(selected_timepoints),2), " at ", params$unit_time, " ", round(selected_timepoints,0), ", ", names_percentiles),
  vjust=1.5
)
example_slopes_bounded = rbind(example_slopes_bounded, list(x=0.83*xmax, y=yf, label=paste0("boundary: ", round(yf, 1)), vjust=-1.0))

# CI ribbon from the profile confidence bounds of yf, y0 and log_alpha
if (params$bounded.growth.confidence.interval) ribbon = data.frame(x=seq(0,xmax,0.05), ymin=equ_bounded(seq(0,xmax,0.05), conf["yf","2.5 %"], conf["y0","2.5 %"], conf["log_alpha","2.5 %"]), ymax=equ_bounded(seq(0,xmax,0.05), conf["yf","97.5 %"], conf["y0","97.5 %"], conf["log_alpha","97.5 %"]))

quant = quantile(table(data$id))
print(paste0("n tests = ", nrow(data), " (n ", params$unit_n, "s = ", length(unique(data$id)), ", median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 4388 (n patients = 251, median tests per patient: 11, IQR 6.5-18)"
# Bounded-growth figure: trajectories (grey), fitted curve (red), annotated
# baseline / half-practice / 90%-practice points and the asymptote label
p1 = ggplot() + geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.2) +
  theme_pubr(base_family="Serif") + scale_x_continuous(limits = c(0,xmax), expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + xlab(params$xlab) + ylab(params$unit) +
  geom_vline(xintercept=example_slopes_bounded[2,"x"], color=linecolor, linetype=2) +
  stat_function(fun=equ_bounded, color=linecolor, size=1) +
  geom_point(data=example_slopes_bounded[1:(nrow(example_slopes_bounded)-1),], aes(x,y), color=linecolor, size=5) +
  geom_text(data=example_slopes_bounded, aes(x,y,label=label, vjust=vjust), color=linecolor, hjust=-0.01, family="Serif")

# overlay the profile-CI ribbon when available
if (params$bounded.growth.confidence.interval) p1 = p1 + geom_ribbon(aes(x=x, ymin=ymin, ymax=ymax), ribbon, fill=linecolor, alpha=0.3)

p1
## Warning: Removed 91 row(s) containing missing values (geom_path).

Quantile regression

# Censor the quantile-regression data at the half-practice point unless a
# fixed cut-off is supplied (params$censor_after is not set in params above,
# so the half-practice point branch is taken here).
# NOTE: `round` is the sprintf wrapper, so it returns a string that
# as.integer() then parses back to an integer.
if (is.null(params$censor_after)) {
  censor_after = as.integer(round(selected_timepoints[2])) # half-practice point
} else {
  censor_after = params$censor_after
}
data_censored = data[data$predictor <= censor_after,]

# quantiles to model
percentiles = c(0.05,0.25,0.5,0.75,0.95)

# linear quantile regression of score on predictor, one fit per percentile
QR = rq(value ~ predictor, tau=percentiles, data_censored)

# Per-quantile coefficient tables with kernel-based ("ker") standard errors.
# The summary list is computed once and reused (the original recomputed the
# full summary inside every iteration); printed output is unchanged.
# `round` is the sprintf wrapper, so the printed CIs keep trailing zeros.
QR_summaries = summary(QR, se="ker")
p_vals = sapply(seq_along(QR_summaries), function(i) {
  summ = coef(QR_summaries[[i]])
  print(summ)
  print(paste0("Intercept: ", round(summ[1,1],1), " (", round(summ[1,1]-1.96*summ[1,2],1), "-", round(summ[1,1]+1.96*summ[1,2],1), "), beta: ", round(summ[2,1],2), " (", round(summ[2,1]-1.96*summ[2,2],2), "-", round(summ[2,1]+1.96*summ[2,2],2), ")"))
  summ[2,4] # p-value of the slope (predictor) coefficient
})
##                 Value Std. Error   t value     Pr(>|t|)
## (Intercept) 22.500000  1.0682255 21.062969 0.000000e+00
## predictor    1.166667  0.2014399  5.791636 8.016017e-09
## [1] "Intercept: 22.5 (20.4-24.6), beta: 1.17 (0.77-1.56)"
##                  Value Std. Error   t value    Pr(>|t|)
## (Intercept) 35.0000000  0.5341734 65.521795 0.00000e+00
## predictor    0.8333333  0.1059255  7.867164 5.77316e-15
## [1] "Intercept: 35.0 (34.0-36.0), beta: 0.83 (0.63-1.04)"
##             Value Std. Error   t value Pr(>|t|)
## (Intercept)    41  0.4823704 84.996921        0
## predictor       1  0.1047662  9.545067        0
## [1] "Intercept: 41.0 (40.1-41.9), beta: 1.00 (0.79-1.21)"
##             Value Std. Error   t value Pr(>|t|)
## (Intercept)    47  0.5122156 91.758239        0
## predictor       1  0.1087158  9.198297        0
## [1] "Intercept: 47.0 (46.0-48.0), beta: 1.00 (0.79-1.21)"
##              Value Std. Error   t value     Pr(>|t|)
## (Intercept) 55.875  0.7185765 77.757901 0.000000e+00
## predictor    1.125  0.1737523  6.474736 1.178009e-10
## [1] "Intercept: 55.9 (54.5-57.3), beta: 1.12 (0.78-1.47)"
# Bonferroni-adjust the five slope p-values
p_vals = p.adjust(p_vals, method="bonferroni")

# joint test across quantiles (equality of slopes)
ANOVA = anova(QR)

quant = quantile(table(data_censored$id))
print(paste0("n tests = ", nrow(data_censored), " (median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 2111 (median tests per patient: 10, IQR 6.5-10)"
# Format a p-value for display: "= <value rounded to `digits` significant
# figures>", or the conventional "< 2e-16" when it rounds to exactly zero.
signif_p = function(x, digits=1) {
  rounded = signif(x, digits)
  if (rounded == 0) "< 2e-16" else paste0("= ", rounded)
}

# Spaghetti plot of per-unit trajectories with the five fitted quantile
# regression lines overlaid, per-quantile slope annotations, the ANOVA
# p-value, and a dashed line marking the censoring threshold.
# NOTE(review): aes_string() is deprecated in recent ggplot2 — works here,
# but consider aes(.data[[...]]) when upgrading.
ggplot() + geom_line(aes_string("predictor", "value", group="id"), data_censored, alpha=0.2, color="darkgrey") + theme_pubr(base_family="Serif") + scale_x_continuous(expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + theme(legend.position = "none") + xlab(params$xlab) + ylab(params$unit) +
  geom_abline(intercept=coef(QR)[1,], slope=coef(QR)[2,], color=linecolor) +
  # One label per quantile, anchored at that quantile's intercept
  geom_text(data=data.frame(intercept=coef(QR)[1,], label=paste0(percentiles*100, "th percentile: β = ", round(coef(QR)[2,],1), ", ", "p.adj ", sapply(p_vals,signif_p))),
            mapping=aes(x=1,y=intercept, label=label), color=linecolor, hjust="left", vjust=1, family="Serif") +
  coord_cartesian(xlim=c(0,censor_after)) +
  geom_text(aes(x=x,y=y,label=label), data.frame(x=0.8*censor_after, y=0, label=paste0("ANOVA p ", signif_p(ANOVA$table$pvalue, 1))), vjust=-1.5, family="Serif") +
  geom_vline(xintercept=censor_after, color=linecolor, linetype=2)

Dexterity (Finger Pinching)

# Analysis parameters for the Finger Pinching (dexterity) test.
params = list(
  test_code = "pinching",                    # Floodlight test identifier
  test_metric_code = "successful_pinches",   # metric analysed for this test
  unit = "Pinching: Successful Pinches",     # y-axis label for plots
  unit_n = "hand",                           # unit of analysis (per hand)
  unit_time = "repetition",                  # unit of the time axis in labels
  min_repetitions = 5,                       # inclusion: at least 5 repetitions
  min_weeks = 5,                             # inclusion: at least 5 weeks of participation
  predictor = "repetition",                  # column used as model predictor
  xlab = "Repetitions",                      # x-axis label
  bounded.growth.confidence.interval = TRUE, # TRUE/FALSE instead of reassignable T
  up_to_date = "2021-05-01"                  # data cut-off date (exclusive)
)
library(data.table) # fread
library(parsedate) # parse_date
library(dplyr) # group_by
library(tibble) # deframe
library(lme4) # lmer
library(mgcv) # gamm
library(quantreg) # rq
library(patchwork) # plot_layout
library(gridExtra) # grid.arrange
library(ggpubr) # ggscatter
library(ggtext) # geom_text
library(sjPlot) # plot_model

# Download from: https://dataset.floodlightopen.com/public-blobs-prod/complete_dataset.csv
# Load the full public Floodlight dataset as a plain data.frame
data = fread("complete_dataset.csv", data.table=F)

# Prepare dataset: keep only the selected test and exclude control participants
data = data[data$testCode == params$test_code & !data$participantIsControl,]
data$time = parse_date(data$testStartedAt)
data = data[data$time <= as.POSIXct(params$up_to_date, tz="UTC"),] # only analyse data up to (excluding) params$up_to_date
data = data[!duplicated(data),] # sometimes contains true duplicates for some reason (even with the same testResultMetricId)

# For "Finger Pinching" hand_used has to be determined
# The pinching test stores two metric rows per test (hand + pinch count);
# pivot them into one row per test. Other tests keep the long format with
# an NA hand_used placeholder so both branches yield the same columns.
if (params$test_code == "pinching") {
  library(tidyr) # pivot_wider
  # just one means either "hand" or "successful_pinches" values are missing, remove those
  table(table(data$time))
  # Tests identified by their (assumed unique) start timestamp — TODO confirm
  data = data[!data$time %in% names(which(table(data$time)==1)), ]
  data = as.data.frame(pivot_wider(data, id_cols=c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time"), names_from="testMetricCode", values_from="testResultMetricValue"))
} else {
  data = data[data$testMetricCode == params$test_metric_code,]
  data$hand_used = NA
  data = data[c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time", "testResultMetricValue", "hand_used")]
}

# Normalise column names across both branches above
colnames(data) = c("id", "control", "sex", "birthyear", "time", "value", "hand_used")
data$age = year(data$time)-data$birthyear # Estimate age
# Stable sort by participant id; within-id row order is preserved
data = data[order(as.character(data$id)),]

# 0 result values are discarded
data = data[!is.na(data$value) & data$value != 0,]

# Consider those supposedly younger than 18 (minimum study age) and older than 90 as NA
data$age[data$age < 18 | data$age > 90] = NA

# For pinching, the analysis unit is the hand: id becomes participant+hand
# (for other tests hand_used is NA, so ids stay one-per-participant)
data$id_original = data$id
data$id = paste0(data$id, "_hand", data$hand_used)

# Calendar day of each test
data$day = as.IDate(data$time)
# Fixed-decimal formatter (returns character). Deliberately masks
# base::round so every summary string shows exactly `digits` decimals.
round = function(x, digits=0) {
  fmt = paste0("%.", digits, "f")
  sprintf(fmt, x)
}

# Reusable ggplot themes that blank out one axis (for stacked panels)
no_x = theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
no_y = theme(axis.text.y=element_blank(), axis.ticks.y=element_blank())

# Accent colour used for fitted curves and annotations throughout
linecolor = "#c71138"

Participant selection

# At least x weeks & repetitions
for (id in unique(data$id)) {
  subset = data$id == id
  n = sum(subset)
  data[subset, "repetition"] = (1:n)-1
  data[subset, "weeksSinceFirst"] = as.numeric(difftime(data[subset, "time"], data[subset, "time"][1], unit="weeks"))
}

# Pre-filter cohort sizes, kept for the inclusion-rate table below
n_orig = nrow(data)
n_patients_orig = length(unique(data$id_original))
n_hands_orig = length(unique(data$id))

# Per-unit participation length: last weeksSinceFirst / last repetition index
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(weeksSinceFirst), repetitions=last(repetition), .groups="keep")

Among the total n=1816 hands (from n=1059 patients) with n=20695 repetitions, the median length of participation is 0.6 weeks (IQR 0.0-8.1, range 0.0-151.7) and the median number of repetitions is 2 (IQR 1-7, range 1-370).

# Inclusion filter: keep ids with >= min_weeks of participation and
# >= min_repetitions tests (repetition is 0-based, hence the +1)
data = data[data$id %in% participation_duration$id[participation_duration$weeks >= params$min_weeks & participation_duration$repetitions+1 >= params$min_repetitions],]

# Days elapsed since the same id's previous test (0 for the first test).
# Vectorized group-wise diff replaces the original O(n^2) loop, which also
# mis-built the lag vector for a single-row id: time[1:(n-1)] with n == 1
# indexes 1:0 = c(1, 0). c(0, diff(s)) handles n == 1 cleanly and is
# otherwise identical: (t_i - t_{i-1}) seconds / 86400 = days.
data$daysSinceLast = ave(as.numeric(data$time), data$id,
                         FUN = function(s) c(0, diff(s))) / 86400  # 86400 s = 1 day

# Post-filter per-unit summary: demographics, participation length and
# inter-test interval statistics (median / IQR of daysSinceLast)
participation_duration = data %>% group_by(id) %>% summarise(sex=first(sex), mean_age=mean(age), weeks=last(weeksSinceFirst), repetitions=last(repetition), median_intertest_interval=median(daysSinceLast), IQR_intertest_interval=IQR(daysSinceLast), .groups="keep")

# Generic predictor column so downstream code is agnostic to params$predictor
data$predictor = data[,params$predictor]

Inclusion criteria: participation for at least 5 weeks and at least 5 repetitions performed per test, leading to the analysis of n=252 / 1059 patients, 470 / 1816 hands and n=17945 / 20695 tests. Among those, the median length of participation is 15.6 weeks (IQR 9.9-43.4, range 5.0-151.7) and the median number of repetitions is 18 (IQR 10-42.75, range 5-370).

# Cohort description table: inclusion rates plus medians (IQR, range) of
# age, repetitions, inter-test intervals and participation length.
# round() is the script's fixed-decimal formatter (returns character).
t(data.frame(
  n_patients = paste0(length(unique(data$id_original)), " / ", n_patients_orig, " (", round(length(unique(data$id_original))/n_patients_orig*100,1), "%)"),
  n_hands = paste0(length(unique(data$id)), " / ", n_hands_orig, " (", round(length(unique(data$id))/n_hands_orig*100,1), "%)"),
  n_tests = paste0(nrow(data), " / ", n_orig, " (", round(nrow(data)/n_orig*100,1), "%)"),
  percent_female = paste0(round(prop.table(table(participation_duration$sex == "female"))[[2]]*100, 1)),
  age = paste0(round(median(participation_duration$mean_age,na.rm=T),1), " (", round(quantile(participation_duration$mean_age, 0.25, na.rm=T),1), "-", round(quantile(participation_duration$mean_age, 0.75, na.rm=T),1), ", range ", round(min(participation_duration$mean_age, na.rm=T),1), "-", round(max(participation_duration$mean_age, na.rm=T),1), ")"),
  repetitions = paste0(median(participation_duration$repetitions)+1, " repetitions (IQR ", quantile(participation_duration$repetitions+1, 0.25), "-", quantile(participation_duration$repetitions+1, 0.75), ", range ", min(participation_duration$repetitions+1), "-", max(participation_duration$repetitions+1), ")"),
  median_intertest_interval = paste0(round(median(participation_duration$median_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$median_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$median_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$median_intertest_interval),1), "-", round(max(participation_duration$median_intertest_interval),1), ")"),
  IQR_intertest_interval = paste0(round(median(participation_duration$IQR_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$IQR_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$IQR_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$IQR_intertest_interval),1), "-", round(max(participation_duration$IQR_intertest_interval),1), ")"),
  weeks = paste0(round(median(participation_duration$weeks),1), " weeks (IQR ", round(quantile(participation_duration$weeks, 0.25),1), "-", round(quantile(participation_duration$weeks, 0.75),1), ", range ", round(min(participation_duration$weeks),1), "-", round(max(participation_duration$weeks),1), ")")
))
##                           [,1]                                        
## n_patients                "252 / 1059 (23.8%)"                        
## n_hands                   "470 / 1816 (25.9%)"                        
## n_tests                   "17945 / 20695 (86.7%)"                     
## percent_female            "71.7"                                      
## age                       "49.8 (41.7-57.5, range 20.0-79.0)"         
## repetitions               "18 repetitions (IQR 10-42.75, range 5-370)"
## median_intertest_interval "3.1 days (IQR 2.1-4.9, range 1.9-39.2)"    
## IQR_intertest_interval    "3.0 days (IQR 1.0-7.0, range 0.0-101.5)"   
## weeks                     "15.6 weeks (IQR 9.9-43.4, range 5.0-151.7)"

Summary level analysis

Difference test

# One row per unit: first / mean / last score, participation stats and the
# first-to-last score difference used by the summary-level models
df = as.data.frame(data %>% group_by(id) %>% summarise(first=first(value), last=last(value), mean=mean(value), weeksSinceFirst=max(weeksSinceFirst), repetition=n(), first_age=first(age), last_age=last(age), mean_age=mean(age), .groups="keep") %>% mutate(diff=last-first))

df$predictor = df[, params$predictor]

# Paired t-test: did the last score improve over the first score?
test = t.test(df$last, df$first, paired=T)
test
## 
##  Paired t-test
## 
## data:  df$last and df$first
## t = 21.525, df = 469, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  13.41214 16.10701
## sample estimates:
## mean of the differences 
##                14.75957
# Baseline model: score change explained by age and first score only
# (both rescaled to per-10 units for readable coefficients)
mod0 = lm(diff ~ I(mean_age/10) + I(first/10), df)
summ0 = summary(mod0)
summ0
## 
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10), data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.110  -9.844   0.538  10.620  32.248 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     33.9357     3.6391   9.325   <2e-16 ***
## I(mean_age/10)  -1.1000     0.6020  -1.827   0.0683 .  
## I(first/10)     -5.2868     0.5221 -10.127   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.48 on 465 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.182,  Adjusted R-squared:  0.1785 
## F-statistic: 51.73 on 2 and 465 DF,  p-value: < 2.2e-16
# Full model: adds log10(number of repetitions) as practice-effect term
mod = lm(diff ~ I(mean_age/10) + I(first/10) + log10(predictor), df)
confint(mod)
##                      2.5 %    97.5 %
## (Intercept)      15.688484 29.150318
## I(mean_age/10)   -3.597866 -1.420346
## I(first/10)      -6.350855 -4.514504
## log10(predictor) 11.618785 16.777621
summ = summary(mod)
summ
## 
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10) + log10(predictor), 
##     data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.795  -7.489   0.957   8.396  27.821 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       22.4194     3.4252   6.545 1.57e-10 ***
## I(mean_age/10)    -2.5091     0.5541  -4.529 7.56e-06 ***
## I(first/10)       -5.4327     0.4672 -11.627  < 2e-16 ***
## log10(predictor)  14.1982     1.3126  10.817  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.06 on 464 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.3467, Adjusted R-squared:  0.3425 
## F-statistic: 82.09 on 3 and 464 DF,  p-value: < 2.2e-16
# additional variance explained by predictor
#print(summ$r.squared - summ0$r.squared)

# Observed mean improvement expressed as % of the mean first score,
# with the paired t-test's 95% CI transformed onto the same scale
print(paste0("Average observed improvement over baseline: ", round(test$estimate/mean(df$first)*100, 1), " (", round(test$conf[1]/mean(df$first)*100, 1), "-", round(test$conf[2]/mean(df$first)*100, 1), ")"))
## [1] "Average observed improvement over baseline: 56.6 (51.4-61.8)"
# Label height for the (currently commented-out) significance bracket
lab.y = 1.1*mean(df$last)

# Left panel: mean +/- SE bar chart of first / mean / last scores
p1 = ggbarplot(data.frame(Timepoint=rep(c("First","Mean","Last"),each=nrow(df)), value=c(df$first,df$mean,df$last)), "Timepoint", "value", add="mean_se", label=T, lab.nb.digits=1, lab.vjust=1.9, ylab=params$unit) + xlab("Score") #+ stat_compare_means(comparisons = list(c("First","Last")), paired=T, method="t.test", label.y=lab.y) + scale_y_continuous(expand=expansion(mult=c(0,0.1)))

# Right panel: forest plot of the full model's coefficients
p2 = plot_model(summ, show.values=T, vline.color = "grey", show.intercept=T, colors=linecolor, title=paste0("Difference from First to Last Score, R²=", round(summ$r.squared, 2)), axis.labels=rev(c("Intercept", "Age (per 10 years)", "First score (per 10)", paste0(params$xlab, " (log 10)"))), value.offset=0.3, show.p=F) + ylab("β estimates")

(p1 + p2) + plot_layout(widths=c(2,5)) & theme_pubr(base_family="Serif")

Confounders

# Pairwise confounder panels. The original repeated the same ggscatter /
# gghistogram incantation sixteen times; the two helpers below build each
# panel with identical layer order (base plot, optional scale, axis labels,
# theme), so every panel behaves exactly as before.

# Scatter panel: regression line, 95% CI band and Pearson r annotation.
# `scale` is an optional ggplot scale layer (e.g. a log10 axis).
scatter_panel = function(xvar, yvar, xlabel, ylabel, scale = NULL) {
  p = ggscatter(df, xvar, yvar, add="reg.line", alpha=0.2, cor.coef=TRUE,
                cor.coeff.args=list(color=linecolor), conf.int=TRUE,
                add.params=list(color=linecolor))
  if (!is.null(scale)) p = p + scale
  p + xlab(xlabel) + ylab(ylabel) + theme_pubr(base_family="Serif")
}

# Marginal histogram panel (15 bins); optional scale layer as above.
hist_panel = function(var, xlabel, scale = NULL) {
  p = gghistogram(df, var, bins=15)
  if (!is.null(scale)) p = p + scale
  p + xlab(xlabel) + theme_pubr(base_family="Serif")
}

# Shared axis label for the log-scaled predictor
log10_pred_lab = paste0(params$xlab, " (log10)")
# x-scale used when predictor is on the x axis (extra right margin)
pred_x_scale = function() scale_x_log10(expand = expansion(mult = c(.05, .15)))

p_age_first = scatter_panel("mean_age", "first", "Mean age", "First score")
p_age_pred = scatter_panel("mean_age", "predictor", "Mean age", log10_pred_lab, scale_y_log10())
p_age_last = scatter_panel("mean_age", "last", "Mean age", "Last score")
p_age_diff = scatter_panel("mean_age", "diff", "Mean age", "Difference first to last score")
p_first_pred = scatter_panel("first", "predictor", "First score", log10_pred_lab, scale_y_log10())
p_first_last = scatter_panel("first", "last", "First score", "Last score")
p_first_diff = scatter_panel("first", "diff", "First score", "Difference first to last score")
p_pred_last = scatter_panel("predictor", "last", log10_pred_lab, "Last score", pred_x_scale())
p_pred_diff = scatter_panel("predictor", "diff", log10_pred_lab, "Difference first to last score", pred_x_scale())
p_last_diff = scatter_panel("last", "diff", "Last score", "Difference first to last score")
p_last_pred = scatter_panel("last", "predictor", "Last score", log10_pred_lab, scale_y_log10())

p_age = hist_panel("mean_age", NULL)
p_first = hist_panel("first", NULL)
p_last = hist_panel("last", NULL)
p_pred = hist_panel("predictor", NULL, scale_x_log10())
p_diff = hist_panel("diff", "Difference first to last")

#(((p1+xlab(NULL)) + (p2+xlab(NULL)+ylab(NULL))) / ((p3+xlab(NULL)) + (p4+xlab(NULL)+ylab(NULL))) / ((p5) | (p6+ylab(NULL)))) & theme_pubr(base_family="Serif")

#(p_age_first | p_first) / (p_age_last | p_first_last | p_last) / (p_age_pred | p_first_pred | p_last_pred | p_pred) / (p_age_diff | p_first_diff | p_last_diff | p_pred_diff)

# Lower-triangular 5x5 layout: diagonal = marginal histograms,
# off-diagonals = pairwise scatter panels (a manual "pairs" plot)
m <- matrix(NA, 5, 5)
m[lower.tri(m, diag = T)] <- 1:15
grid.arrange(grobs=list(
  p_age, p_age_first+xlab(NULL), p_age_pred+xlab(NULL), p_age_last+xlab(NULL), p_age_diff,
  p_first, p_first_pred+xlab(NULL)+ylab(""), p_first_last+xlab(NULL)+ylab(""), p_first_diff+ylab(""),
  p_pred, p_pred_last+xlab(NULL)+ylab(""), p_pred_diff+ylab(""),
  p_last, p_last_diff+ylab(""), 
  p_diff
), layout_matrix=m, heights=c(1,1,1,1,1.1))
## Warning: Removed 2 rows containing non-finite values (stat_bin).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing non-finite values (stat_cor).
## Warning: Removed 2 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing non-finite values (stat_cor).
## Warning: Removed 2 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing non-finite values (stat_cor).
## Warning: Removed 2 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing non-finite values (stat_cor).
## Warning: Removed 2 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

#GGally::ggpairs(df[c("mean_age", "first", "last", "predictor", "diff")])
#pairs(df[c("mean_age", "first", "last", "predictor", "diff")], upper.panel=NULL)
#corrplot::corrplot(cor(df[c("mean_age", "first", "last", "predictor", "diff")], use="complete.obs"))

Learning curve: Model selection

# Candidate model 1: GAMM with a P-spline smooth over the predictor and a
# random intercept per id
smoothing_spline = gamm(value ~ s(predictor, bs="ps"), random=list(id=~1), data=data)
summary(smoothing_spline$lme)
## Linear mixed-effects model fit by maximum likelihood
##  Data: strip.offset(mf) 
##        AIC      BIC    logLik
##   125820.5 125859.5 -62905.26
## 
## Random effects:
##  Formula: ~Xr - 1 | g
##  Structure: pdIdnot
##              Xr1      Xr2      Xr3      Xr4      Xr5      Xr6      Xr7      Xr8
## StdDev: 7.882111 7.882111 7.882111 7.882111 7.882111 7.882111 7.882111 7.882111
## 
##  Formula: ~1 | id %in% g
##         (Intercept) Residual
## StdDev:    11.77696  7.63915
## 
## Fixed effects: y ~ X - 1 
##                     Value Std.Error    DF  t-value p-value
## X(Intercept)     42.46120  0.555801 17474 76.39641       0
## Xs(predictor)Fx1 45.84515  9.671501 17474  4.74023       0
##  Correlation: 
##                  X(Int)
## Xs(predictor)Fx1 0.006 
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -6.27200948 -0.49795419  0.07459264  0.60154891  7.10975647 
## 
## Number of Observations: 17945
## Number of Groups: 
##         g id %in% g 
##         1       470
# Smooth-term significance / edf from the GAM component of the fit
summary(smoothing_spline$gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## value ~ s(predictor, bs = "ps")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  42.4612     0.5558    76.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df     F p-value    
## s(predictor) 8.352  8.352 636.1  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.209   
##   Scale est. = 58.357    n = 17945
# Candidate model 2: linear mixed model, random intercept per id
REM_linear = lmer(value ~ (1|id) + predictor, data)
# Population-level (fixed-effects only) prediction line
equ_linear = function(t) fixef(REM_linear)[1] + fixef(REM_linear)[2]*t
summary(REM_linear)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor
##    Data: data
## 
## REML criterion at convergence: 128354.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.2240 -0.4944  0.0992  0.6193  5.7347 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 146.79   12.116  
##  Residual              67.56    8.219  
## Number of obs: 17945, groups:  id, 470
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept) 35.967096   0.568250   63.30
## predictor    0.066612   0.001424   46.78
## 
## Correlation of Fixed Effects:
##           (Intr)
## predictor -0.048
# Candidate model 3: adds a quadratic fixed effect of the predictor
REM_quadratic = lmer(value ~ (1|id) + predictor + I(predictor^2), data)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
# Population-level quadratic prediction curve
equ_quadratic = function(t) fixef(REM_quadratic)[1] + fixef(REM_quadratic)[2]*t + fixef(REM_quadratic)[3]*t^2
summary(REM_quadratic)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor + I(predictor^2)
##    Data: data
## 
## REML criterion at convergence: 127487.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.2080 -0.5027  0.0813  0.6176  6.3466 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 141.93   11.913  
##  Residual              64.28    8.017  
## Number of obs: 17945, groups:  id, 470
## 
## Fixed effects:
##                  Estimate Std. Error t value
## (Intercept)     3.477e+01  5.600e-01   62.08
## predictor       1.582e-01  3.339e-03   47.38
## I(predictor^2) -3.698e-04  1.226e-05  -30.16
## 
## Correlation of Fixed Effects:
##             (Intr) prdctr
## predictor   -0.084       
## I(prdctr^2)  0.071 -0.909
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
# Candidate model 4: bounded-growth (asymptotic) nonlinear mixed model,
# value(t) = yf + (y0 - yf) * exp(-exp(log_alpha) * t), with per-id random
# effects on the baseline y0 and the asymptote yf
REM_bounded = nlmer(value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0|id) + (yf|id), data = data, start=c(yf=40, y0=20, log_alpha=-1))
# Fixed-effect estimates reused as globals by the equ_* helpers below
y0=fixef(REM_bounded)["y0"]
yf=fixef(REM_bounded)["yf"]
log_alpha=fixef(REM_bounded)["log_alpha"]
# Population-level bounded-growth curve; defaults to the fitted fixed effects
equ_bounded = function(t, yf=fixef(REM_bounded)[["yf"]], y0=fixef(REM_bounded)[["y0"]], log_alpha=fixef(REM_bounded)[["log_alpha"]]) yf+(y0-yf)*exp(-exp(log_alpha)*t)
summary(REM_bounded)
## Nonlinear mixed model fit by maximum likelihood  ['nlmerMod']
## Formula: value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0 | id) + (yf |  
##     id)
##    Data: data
## 
##      AIC      BIC   logLik deviance df.resid 
## 123112.3 123159.1 -61550.2 123100.3    17939 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.9910 -0.4896  0.0740  0.6091  5.5039 
## 
## Random effects:
##  Groups   Name Variance Std.Dev.
##  id       y0   157.7    12.56   
##  id.1     yf   639.7    25.29   
##  Residual       47.6     6.90   
## Number of obs: 17945, groups:  id, 470
## 
## Fixed effects:
##           Estimate Std. Error t value
## yf        57.63459    1.55630   37.03
## y0        31.00648    0.59943   51.73
## log_alpha -3.65092    0.03953  -92.35
## 
## Correlation of Fixed Effects:
##           yf     y0    
## y0        -0.034       
## log_alpha -0.415 -0.103
# Learning-rate constant alpha on the natural scale
exp(log_alpha)
##  log_alpha 
## 0.02596735
# Modelled (asymptotic) improvement relative to the baseline estimate
cat("Average improvement over baseline: ", (yf-y0)/y0*100)
## Average improvement over baseline:  85.87919
# Model comparison across the four candidates: RMSE, MAE and AIC/edf
RMSE = sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) sqrt(mean(resid(mod)^2))) # RMSE
RMSE
## [1] 8.114177 7.914272 7.539605 6.749939
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) mean(abs(resid(mod)))) # MAE
## [1] 6.033296 5.873121 5.558197 4.998443
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) extractAIC(mod)) # edf & AIC: smoothing_spline$lme always has edf 5
##          [,1]     [,2]     [,3]     [,4]
## [1,]      4.0      5.0      5.0      6.0
## [2,] 128351.9 127466.3 125820.5 123112.3
edf = sapply(list(REM_linear, REM_quadratic, smoothing_spline$gam, REM_bounded), function(mod) { nrow(data)-df.residual(mod) }) # while smoothing_spline$gam often has much higher edf
edf
## [1] 4.000000 5.000000 9.352153 6.000000
# Rounded once here for reuse in the plot annotation below
RMSE = round(RMSE,1)
edf = round(edf,1)

Plot

# NOTE(review): despite the name, "weeks" here is the last predictor value
# per id (repetitions for this analysis) — used only for the drop-off counts
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(predictor), .groups="keep") %>% deframe()
# Number of units still contributing data at each predictor bin (step 10)
remaining_participants_bins = seq(0,350,by=10)
remaining_participants = data.frame(x=remaining_participants_bins, text=sapply(remaining_participants_bins, function(x) sum(participation_duration>=x)))

# Truncate the x axis where fewer than 10 units remain
xmax = remaining_participants$x[which(remaining_participants$text<10)[1]]

# 5th-95th percentile range of scores, used as y limits
range_90p =quantile(data$value, probs=c(0.05,0.95))

p1 = ggplot() +
  geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.1) +
  geom_line(aes(x,y), data.frame(x=0:xmax, y=predict(smoothing_spline$gam, newdata=data.frame(predictor=0:xmax))), linetype="longdash", size=1) + xlim(0,xmax) + ylim(range_90p[1], range_90p[2]) +
  stat_function(fun=equ_linear, color="blue", linetype="dotted", size=1) + 
  stat_function(fun=equ_quadratic, color="green4", linetype="dashed", size=1) + 
  stat_function(fun=equ_bounded, color=linecolor, size=1) + 
  theme_pubr(base_family="Serif") + no_x + xlab(NULL) + ylab(params$unit) +
  geom_richtext(aes(x,y,label=label,hjust=1), data.frame(x=0.8*xmax, y=range_90p[2]-(range_90p[2]-range_90p[1])/1.3, label=paste0("Model RMSE (edf):<br><span style='color:#0000ff'>····· Linear: ", RMSE[1], " (", edf[1], ") </span><br><span style='color:#008b00'>- - - Quadratic: ", RMSE[2], " (", edf[2], ") </span><br><span style='color:#000000'>— — Smoothing spline: ", RMSE[3], " (", edf[3], ") </span><br><span style='color:", linecolor, "'>— Bounded growth: ", RMSE[4], " (", edf[4], ") </span>")), family="Serif")

p2 = ggplot(remaining_participants[remaining_participants$x<=xmax,]) +
  geom_text(aes(x=x,y="A",label=text), family="Serif") +
  theme_pubr(base_family="Serif") + no_y + xlab(params$xlab) + ylab(paste0("Remaining \n ", params$unit_n, "s")) +
  scale_x_continuous(breaks=seq(0,xmax,by=10))

(p1 / p2) + plot_layout(heights=c(0.9,0.1))
## Warning: Removed 964 row(s) containing missing values (geom_path).

# Profile-likelihood 95% CIs for the bounded-growth fixed effects (slow)
if (params$bounded.growth.confidence.interval) conf = confint.merMod(REM_bounded, c("y0","yf","log_alpha"), method="profile")
## Computing profile confidence intervals ...
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
# Report the CIs, alpha on the natural scale, and the modelled boundary
# improvement with its CI (lower/upper bounds combined element-wise)
if (params$bounded.growth.confidence.interval) {
  print(conf)
  print(exp(conf[3,]))
  print(paste0("Average boundary improvement over baseline: ", round((yf-y0)/y0*100, 1), " (", round((conf[1,1]-conf[2,1])/conf[2,1]*100, 1), "-", round((conf[1,2]-conf[2,2])/conf[2,2]*100, 1), ")"))
}
##               2.5 %   97.5 %
## yf        54.639426 60.80591
## y0        29.828866 32.18362
## log_alpha -3.729284 -3.57407
##      2.5 %     97.5 % 
## 0.02401002 0.02804148 
## [1] "Average boundary improvement over baseline: 85.9 (83.2-88.9)"

Bounded growth model

# Slope (first derivative) of the fitted bounded-growth curve at time t:
# d/dt [yf + (y0 - yf) e^(-alpha t)] = alpha (yf - y0) e^(-alpha t)
equ_diff_REM_bounded = function(t) {
  alpha = exp(log_alpha)
  alpha * (yf - y0) * exp(-alpha * t)
}
# Inverse of the derivative: time at which the slope equals target_slope
equ_diff_get_time_REM_bounded = function(target_slope) {
  alpha = exp(log_alpha)
  log(alpha * (yf - y0) / target_slope) / alpha
}

# Inverse of equ_bounded: time at which the curve reaches target_value
equ_bounded_get_x = function(target_value) {
  exp(-log_alpha) * log((yf - y0) / (yf - target_value))
}

# Landmark points on the fitted curve: baseline, half-practice point and
# 90%-of-practice point, each annotated with value, slope and time
growth_percentiles = c(0, 0.5, 0.9)
names_percentiles = c("baseline", "half-practice point", "90% practice")
selected_timepoints = equ_bounded_get_x(y0+(yf-y0)*growth_percentiles)
example_slopes_bounded = data.frame(
  x=selected_timepoints,
  y=equ_bounded(selected_timepoints),
  label=paste0("y=", round(equ_bounded(selected_timepoints),1), ", m=", signif(equ_diff_REM_bounded(selected_timepoints),2), " at ", params$unit_time, " ", round(selected_timepoints,0), ", ", names_percentiles),
  vjust=1.5
)
# Extra annotation row marking the asymptote (boundary) itself
example_slopes_bounded = rbind(example_slopes_bounded, list(x=0.83*xmax, y=yf, label=paste0("boundary: ", round(yf, 1)), vjust=-1.0))

# Confidence ribbon built from the lower/upper CI bounds of all three params
if (params$bounded.growth.confidence.interval) ribbon = data.frame(x=seq(0,xmax,0.05), ymin=equ_bounded(seq(0,xmax,0.05), conf["yf","2.5 %"], conf["y0","2.5 %"], conf["log_alpha","2.5 %"]), ymax=equ_bounded(seq(0,xmax,0.05), conf["yf","97.5 %"], conf["y0","97.5 %"], conf["log_alpha","97.5 %"]))

# Cohort size / tests-per-unit summary for the full (uncensored) data
quant = quantile(table(data$id))
print(paste0("n tests = ", nrow(data), " (n ", params$unit_n, "s = ", length(unique(data$id)), ", median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 17945 (n hands = 470, median tests per hand: 18, IQR 10-42.75)"
# Bounded-growth figure: raw trajectories, fitted curve, landmark points,
# half-practice vertical line and (optionally) the CI ribbon
p1 = ggplot() + geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.2) +
  theme_pubr(base_family="Serif") + scale_x_continuous(limits = c(0,xmax), expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + xlab(params$xlab) + ylab(params$unit) +
  geom_vline(xintercept=example_slopes_bounded[2,"x"], color=linecolor, linetype=2) +
  stat_function(fun=equ_bounded, color=linecolor, size=1) +
  # All landmark points except the boundary annotation row
  geom_point(data=example_slopes_bounded[1:(nrow(example_slopes_bounded)-1),], aes(x,y), color=linecolor, size=5) +
  geom_text(data=example_slopes_bounded, aes(x,y,label=label, vjust=vjust), color=linecolor, hjust=-0.01, family="Serif")

if (params$bounded.growth.confidence.interval) p1 = p1 + geom_ribbon(aes(x=x, ymin=ymin, ymax=ymax), ribbon, fill=linecolor, alpha=0.3)

p1
## Warning: Removed 480 row(s) containing missing values (geom_path).

Quantile regression

# Censoring threshold: half-practice point unless overridden via params.
# NOTE(review): round() is the script's masked formatter and returns a
# character string here, which as.integer() then parses — works, but fragile.
if (is.null(params$censor_after)) {
  censor_after = as.integer(round(selected_timepoints[2])) # half-practice point
} else {
  censor_after = params$censor_after
}
data_censored = data[data$predictor <= censor_after,]

# Quantiles modelled: 5th, 25th, median, 75th and 95th percentile
percentiles = c(0.05,0.25,0.5,0.75,0.95)

# Quantile regression of test score on repetition number; one fit per tau
QR = rq(value ~ predictor, tau=percentiles, data_censored)

p_vals = sapply(1:length(summary(QR)), function(i) {
  summ = coef(summary(QR, se="ker")[[i]])
  print(summ)
  print(paste0("Intercept: ", round(summ[1,1],1), " (", round(summ[1,1]-1.96*summ[1,2],1), "-", round(summ[1,1]+1.96*summ[1,2],1), "), beta: ", round(summ[2,1],2), " (", round(summ[2,1]-1.96*summ[2,2],2), "-", round(summ[2,1]+1.96*summ[2,2],2), ")"))
  summ[2,4]
})
##                 Value Std. Error   t value     Pr(>|t|)
## (Intercept) 8.4545455 0.49444260 17.099145 0.000000e+00
## predictor   0.2727273 0.03885682  7.018775 2.408296e-12
## [1] "Intercept: 8.5 (7.5-9.4), beta: 0.27 (0.20-0.35)"
##                  Value Std. Error  t value Pr(>|t|)
## (Intercept) 20.8571429 0.34196863 60.99139        0
## predictor    0.5714286 0.02945366 19.40094        0
## [1] "Intercept: 20.9 (20.2-21.5), beta: 0.57 (0.51-0.63)"
##             Value Std. Error  t value Pr(>|t|)
## (Intercept) 30.70 0.36373533 84.40203        0
## predictor    0.65 0.02542666 25.56372        0
## [1] "Intercept: 30.7 (30.0-31.4), beta: 0.65 (0.60-0.70)"
##             Value Std. Error   t value Pr(>|t|)
## (Intercept) 42.52  0.3733377 113.89154        0
## predictor    0.48  0.0252041  19.04452        0
## [1] "Intercept: 42.5 (41.8-43.3), beta: 0.48 (0.43-0.53)"
##                  Value Std. Error   t value Pr(>|t|)
## (Intercept) 54.0769231 0.43694387 123.76172        0
## predictor    0.3846154 0.03197911  12.02708        0
## [1] "Intercept: 54.1 (53.2-54.9), beta: 0.38 (0.32-0.45)"
p_vals = p.adjust(p_vals, method="bonferroni")

ANOVA = anova(QR)

quant = quantile(table(data_censored$id))
print(paste0("n tests = ", nrow(data_censored), " (median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 8593 (median tests per hand: 18, IQR 10-28)"
# Format a p-value for display: "= <p>" after rounding to `digits`
# significant figures, or "< 2e-16" when the value has underflowed to
# exactly zero (R reports p-values below ~2.2e-16 as 0).
signif_p = function(x, digits=1) {
  x = signif(x, digits)
  # Numeric comparison replaces the fragile as.character(x) == "0" string
  # test; behavior is identical for all numeric scalars.
  if (x == 0) return("< 2e-16")
  else return(paste0("= ", x))
}

# Spaghetti plot of the censored data with one regression line per percentile,
# labelled with the adjusted p-values; the dashed vertical line marks the
# censoring point and the ANOVA p tests for slope differences across quantiles.
ggplot() + geom_line(aes_string("predictor", "value", group="id"), data_censored, alpha=0.2, color="darkgrey") + theme_pubr(base_family="Serif") + scale_x_continuous(expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + theme(legend.position = "none") + xlab(params$xlab) + ylab(params$unit) +
  geom_abline(intercept=coef(QR)[1,], slope=coef(QR)[2,], color=linecolor) +
  geom_text(data=data.frame(intercept=coef(QR)[1,], label=paste0(percentiles*100, "th percentile: β = ", round(coef(QR)[2,],1), ", ", "p.adj ", sapply(p_vals,signif_p))),
            mapping=aes(x=1,y=intercept, label=label), color=linecolor, hjust="left", vjust=1, family="Serif") +
  coord_cartesian(xlim=c(0,censor_after)) +
  geom_text(aes(x=x,y=y,label=label), data.frame(x=0.8*censor_after, y=0, label=paste0("ANOVA p ", signif_p(ANOVA$table$pvalue, 1))), vjust=-1.5, family="Serif") +
  geom_vline(xintercept=censor_after, color=linecolor, linetype=2)

Mobility (Two Minute Walk)

# Analysis parameters for the Two Minute Walk test. Same pipeline as the
# SDMT section; here no confidence ribbon is drawn and the quantile
# regression is censored at repetition 9 to match the cognition analysis.
params = list(
  test_code = "two_min_walk",
  test_metric_code = "steps",
  unit = "Two Minute Walk: Steps",
  unit_n = "patient",
  unit_time = "repetition",
  min_repetitions = 5,
  min_weeks = 5,
  predictor = "repetition",
  xlab = "Repetitions",
  bounded.growth.confidence.interval = F,
  censor_after = 9, # allow comparison with cognition
  up_to_date = "2021-05-01"
)
library(data.table) # fread
library(parsedate) # parse_date
library(dplyr) # group_by
library(tibble) # deframe
library(lme4) # lmer
library(mgcv) # gamm
library(quantreg) # rq
library(patchwork) # plot_layout
library(gridExtra) # grid.arrange
library(ggpubr) # ggscatter
library(ggtext) # geom_text
library(sjPlot) # plot_model

# Download from: https://dataset.floodlightopen.com/public-blobs-prod/complete_dataset.csv
data = fread("complete_dataset.csv", data.table=F)

# Prepare dataset: keep only the selected test for non-control participants.
data = data[data$testCode == params$test_code & !data$participantIsControl,]
data$time = parse_date(data$testStartedAt)
# NOTE(review): the comparison is <= midnight of up_to_date, so a test at
# exactly 00:00:00 UTC on that day would be included — confirm "excluding"
# is the intended semantics.
data = data[data$time <= as.POSIXct(params$up_to_date, tz="UTC"),] # only analyse data up to (excluding) params$up_to_date
data = data[!duplicated(data),] # sometimes contains true duplicates for some reason (even with the same testResultMetricId)

# For "Finger Pinching" hand_used has to be determined
if (params$test_code == "pinching") {
  library(tidyr) # pivot_wider
  # just one means either "hand" or "successful_pinches" values are missing, remove those
  table(table(data$time))
  data = data[!data$time %in% names(which(table(data$time)==1)), ]
  data = as.data.frame(pivot_wider(data, id_cols=c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time"), names_from="testMetricCode", values_from="testResultMetricValue"))
} else {
  # Non-pinching tests have no hand; hand_used stays NA.
  data = data[data$testMetricCode == params$test_metric_code,]
  data$hand_used = NA
  data = data[c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time", "testResultMetricValue", "hand_used")]
}

colnames(data) = c("id", "control", "sex", "birthyear", "time", "value", "hand_used")
data$age = year(data$time)-data$birthyear # Estimate age
data = data[order(as.character(data$id)),]

# 0 result values are discarded
data = data[!is.na(data$value) & data$value != 0,]

# Consider those supposedly younger than 18 (minimum study age) and older than 90 as NA
data$age[data$age < 18 | data$age > 90] = NA

# Analysis unit id: per hand for pinching; for this test hand_used is NA, so
# every patient gets a single "_handNA" id (n_patients == n_hands below).
data$id_original = data$id
data$id = paste0(data$id, "_hand", data$hand_used)

data$day = as.IDate(data$time)
round = function(x, digits=0) sprintf(paste0("%.", digits, "f"), x)

# Theme fragments that blank out one axis, for composite figures.
no_x = theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
no_y = theme(axis.text.y=element_blank(), axis.ticks.y=element_blank())

# Accent color used for all fitted curves and annotations.
linecolor = "#c71138"

Participant selection

# At least x weeks & repetitions
# Per id: 0-based repetition index and weeks elapsed since that id's first
# test. Assumes rows within an id are already in chronological order (the
# earlier sort was by id only, stable on the original row order) —
# NOTE(review): confirm the source CSV is time-ordered within id.
for (id in unique(data$id)) {
  subset = data$id == id
  n = sum(subset)
  data[subset, "repetition"] = (1:n)-1
  data[subset, "weeksSinceFirst"] = as.numeric(difftime(data[subset, "time"], data[subset, "time"][1], unit="weeks"))
}

# Pre-filter totals, kept for the inclusion-rate table below.
n_orig = nrow(data)
n_patients_orig = length(unique(data$id_original))
n_hands_orig = length(unique(data$id))

participation_duration = data %>% group_by(id) %>% summarise(weeks=last(weeksSinceFirst), repetitions=last(repetition), .groups="keep")

Among the total n=540 patients with n=14031 repetitions, the median length of participation is 0.9 weeks (IQR 0.0-11.2, range 0.0-133.5) and the median number of repetitions is 3 (IQR 1-14.25, range 1-735).

data = data[data$id %in% participation_duration$id[participation_duration$weeks >= params$min_weeks & participation_duration$repetitions+1 >= params$min_repetitions],]

# Per id: days between each test and the previous one; the first test of an
# id is paired with itself and therefore gets 0.
for (id in unique(data$id)) {
  subset = data$id == id
  n = sum(subset)
  times = data[subset, "time"]
  # seq_len(n - 1) is empty when n == 1, unlike the original 1:(n-1), which
  # would index c(1, 0) and produce a wrong-length "previous" vector.
  # (Unreachable after the >= min_repetitions filter, but safe regardless.)
  previous = c(times[1], times[seq_len(n - 1)])
  data[subset, "daysSinceLast"] = as.numeric(difftime(times, previous, unit="days"))
}

# Per-id demographics and participation metrics for the included cohort
# (mean_age is NA for ids with any implausible/missing age).
participation_duration = data %>% group_by(id) %>% summarise(sex=first(sex), mean_age=mean(age), weeks=last(weeksSinceFirst), repetitions=last(repetition), median_intertest_interval=median(daysSinceLast), IQR_intertest_interval=IQR(daysSinceLast), .groups="keep")

# The analysis predictor is chosen by name (here: "repetition").
data$predictor = data[,params$predictor]

Inclusion criteria: participation for at least 5 weeks and at least 5 repetitions performed per test, leading to the analysis of n=161 / 540 patients and n=12997 / 14031 tests. Among those, the median length of participation is 16.4 weeks (IQR 10.8-46.4, range 5.3-133.5) and the median number of repetitions is 37 (IQR 15-84, range 5-735).

# Cohort description table: inclusion rates plus median (IQR, range) of the
# key participation metrics. `round` is the string-formatting shadow, so
# every cell is character; t() turns the one-row data.frame into a column.
t(data.frame(
  n_patients = paste0(length(unique(data$id_original)), " / ", n_patients_orig, " (", round(length(unique(data$id_original))/n_patients_orig*100,1), "%)"),
  n_hands = paste0(length(unique(data$id)), " / ", n_hands_orig, " (", round(length(unique(data$id))/n_hands_orig*100,1), "%)"),
  n_tests = paste0(nrow(data), " / ", n_orig, " (", round(nrow(data)/n_orig*100,1), "%)"),
  percent_female = paste0(round(prop.table(table(participation_duration$sex == "female"))[[2]]*100, 1)),
  age = paste0(round(median(participation_duration$mean_age,na.rm=T),1), " (", round(quantile(participation_duration$mean_age, 0.25, na.rm=T),1), "-", round(quantile(participation_duration$mean_age, 0.75, na.rm=T),1), ", range ", round(min(participation_duration$mean_age, na.rm=T),1), "-", round(max(participation_duration$mean_age, na.rm=T),1), ")"),
  repetitions = paste0(median(participation_duration$repetitions)+1, " repetitions (IQR ", quantile(participation_duration$repetitions+1, 0.25), "-", quantile(participation_duration$repetitions+1, 0.75), ", range ", min(participation_duration$repetitions+1), "-", max(participation_duration$repetitions+1), ")"),
  median_intertest_interval = paste0(round(median(participation_duration$median_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$median_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$median_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$median_intertest_interval),1), "-", round(max(participation_duration$median_intertest_interval),1), ")"),
  IQR_intertest_interval = paste0(round(median(participation_duration$IQR_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$IQR_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$IQR_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$IQR_intertest_interval),1), "-", round(max(participation_duration$IQR_intertest_interval),1), ")"),
  weeks = paste0(round(median(participation_duration$weeks),1), " weeks (IQR ", round(quantile(participation_duration$weeks, 0.25),1), "-", round(quantile(participation_duration$weeks, 0.75),1), ", range ", round(min(participation_duration$weeks),1), "-", round(max(participation_duration$weeks),1), ")")
))
##                           [,1]                                         
## n_patients                "161 / 540 (29.8%)"                          
## n_hands                   "161 / 540 (29.8%)"                          
## n_tests                   "12997 / 14031 (92.6%)"                      
## percent_female            "72.0"                                       
## age                       "50.0 (41.7-58.0, range 20.0-74.3)"          
## repetitions               "37 repetitions (IQR 15-84, range 5-735)"    
## median_intertest_interval "1.3 days (IQR 1.0-2.9, range 1.0-24.9)"     
## IQR_intertest_interval    "1.9 days (IQR 0.7-4.6, range 0.1-39.2)"     
## weeks                     "16.4 weeks (IQR 10.8-46.4, range 5.3-133.5)"

Summary level analysis

Difference test

# One row per id: first/last/mean score, participation length, test count
# (repetition = n() here, i.e. the number of tests), ages, and the
# first-to-last difference used as the outcome below.
df = as.data.frame(data %>% group_by(id) %>% summarise(first=first(value), last=last(value), mean=mean(value), weeksSinceFirst=max(weeksSinceFirst), repetition=n(), first_age=first(age), last_age=last(age), mean_age=mean(age), .groups="keep") %>% mutate(diff=last-first))

df$predictor = df[, params$predictor]

# Paired t-test: last vs. first score per patient.
test = t.test(df$last, df$first, paired=T)
test
## 
##  Paired t-test
## 
## data:  df$last and df$first
## t = 0.61336, df = 160, p-value = 0.5405
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -4.922237  9.357020
## sample estimates:
## mean of the differences 
##                2.217391
# Baseline model: difference explained by age and first score only
# (covariates scaled by 10 for readable coefficients).
mod0 = lm(diff ~ I(mean_age/10) + I(first/10), df)
summ0 = summary(mod0)
summ0
## 
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -133.799  -19.684    6.243   23.605  141.065 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     53.5479    25.3623   2.111 0.036329 *  
## I(mean_age/10)   0.5238     3.3466   0.157 0.875817    
## I(first/10)     -2.6593     0.7265  -3.660 0.000344 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.24 on 157 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.08709,    Adjusted R-squared:  0.07546 
## F-statistic: 7.488 on 2 and 157 DF,  p-value: 0.0007829
# Full model: add log10(number of tests) to see whether practice volume
# explains additional variance in the first-to-last difference.
mod = lm(diff ~ I(mean_age/10) + I(first/10) + log10(predictor), df)
confint(mod)
##                        2.5 %     97.5 %
## (Intercept)       0.05856387 101.087320
## I(mean_age/10)   -7.32284922   6.491036
## I(first/10)      -4.28838819  -1.341943
## log10(predictor) -7.57631209  21.101800
summ = summary(mod)
summ
## 
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10) + log10(predictor), 
##     data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -134.951  -19.355    5.857   23.186  133.001 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       50.5729    25.5732   1.978 0.049738 *  
## I(mean_age/10)    -0.4159     3.4967  -0.119 0.905473    
## I(first/10)       -2.8152     0.7458  -3.775 0.000227 ***
## log10(predictor)   6.7627     7.2592   0.932 0.352979    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.26 on 156 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.09214,    Adjusted R-squared:  0.07468 
## F-statistic: 5.277 on 3 and 156 DF,  p-value: 0.001711
# additional variance explained by predictor
#print(summ$r.squared - summ0$r.squared)

# Relative improvement: paired-test estimate as a percentage of the mean
# baseline (first) score, with its 95% CI.
print(paste0("Average observed improvement over baseline: ", round(test$estimate/mean(df$first)*100, 1), " (", round(test$conf[1]/mean(df$first)*100, 1), "-", round(test$conf[2]/mean(df$first)*100, 1), ")"))
## [1] "Average observed improvement over baseline: 1.1 (-2.4-4.6)"
lab.y = 1.1*mean(df$last)

# Bar plot of first/mean/last scores (mean ± SE).
p1 = ggbarplot(data.frame(Timepoint=rep(c("First","Mean","Last"),each=nrow(df)), value=c(df$first,df$mean,df$last)), "Timepoint", "value", add="mean_se", label=T, lab.nb.digits=1, lab.vjust=1.9, ylab=params$unit) + xlab("Score") #+ stat_compare_means(comparisons = list(c("First","Last")), paired=T, method="t.test", label.y=lab.y) + scale_y_continuous(expand=expansion(mult=c(0,0.1)))

# Forest plot of the model coefficients.
# NOTE(review): plot_model is given `summ` (a summary.lm), not `mod` —
# verify sjPlot handles this as intended, or whether `mod` was meant.
p2 = plot_model(summ, show.values=T, vline.color = "grey", show.intercept=T, colors=linecolor, title=paste0("Difference from First to Last Score, R²=", round(summ$r.squared, 2)), axis.labels=rev(c("Intercept", "Age (per 10 years)", "First score (per 10)", paste0(params$xlab, " (log 10)"))), value.offset=0.3, show.p=F) + ylab("β estimates")

(p1 + p2) + plot_layout(widths=c(2,5)) & theme_pubr(base_family="Serif")

Confounders

# Pairwise confounder scatter plots (with regression line, CI and Pearson r)
# among mean age, first score, last score, number of tests (log10 axis), and
# the first-to-last difference; assembled below into a lower-triangle matrix.
p_age_first = ggscatter(df, "mean_age", "first", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("First score") + theme_pubr(base_family="Serif")

p_age_pred = ggscatter(df, "mean_age", "predictor", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("Mean age") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")

p_age_last = ggscatter(df, "mean_age", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("Last score") + theme_pubr(base_family="Serif")

p_age_diff = ggscatter(df, "mean_age", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("Difference first to last score") + theme_pubr(base_family="Serif")

p_first_pred = ggscatter(df, "first", "predictor",  add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("First score") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")

p_first_last = ggscatter(df, "first", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("First score") + ylab("Last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_first_diff = ggscatter(df, "first", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("First score") + ylab("Difference first to last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_pred_last = ggscatter(df, "predictor", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_x_log10(expand = expansion(mult = c(.05, .15))) + xlab(paste0(params$xlab, " (log10)")) + ylab("Last score") + theme_pubr(base_family="Serif")

p_pred_diff = ggscatter(df, "predictor", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_x_log10(expand = expansion(mult = c(.05, .15))) + xlab(paste0(params$xlab, " (log10)")) + ylab("Difference first to last score") + theme_pubr(base_family="Serif")

p_last_diff = ggscatter(df, "last", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Last score") + ylab("Difference first to last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_last_pred = ggscatter(df, "last", "predictor",  add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("Last score") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")


# Marginal histograms for the diagonal of the matrix.
p_age = gghistogram(df, "mean_age", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_first = gghistogram(df, "first", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_last = gghistogram(df, "last", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_pred = gghistogram(df, "predictor", bins=15) + scale_x_log10() + xlab(NULL) + theme_pubr(base_family="Serif")
p_diff = gghistogram(df, "diff", bins=15) + xlab("Difference first to last") + theme_pubr(base_family="Serif")

#(((p1+xlab(NULL)) + (p2+xlab(NULL)+ylab(NULL))) / ((p3+xlab(NULL)) + (p4+xlab(NULL)+ylab(NULL))) / ((p5) | (p6+ylab(NULL)))) & theme_pubr(base_family="Serif")

#(p_age_first | p_first) / (p_age_last | p_first_last | p_last) / (p_age_pred | p_first_pred | p_last_pred | p_pred) / (p_age_diff | p_first_diff | p_last_diff | p_pred_diff)

# Lower-triangle layout matrix: cells 1..15 filled column-wise with the
# histograms on the diagonal and scatter plots below it.
m <- matrix(NA, 5, 5)
m[lower.tri(m, diag = T)] <- 1:15
grid.arrange(grobs=list(
  p_age, p_age_first+xlab(NULL), p_age_pred+xlab(NULL), p_age_last+xlab(NULL), p_age_diff,
  p_first, p_first_pred+xlab(NULL)+ylab(""), p_first_last+xlab(NULL)+ylab(""), p_first_diff+ylab(""),
  p_pred, p_pred_last+xlab(NULL)+ylab(""), p_pred_diff+ylab(""),
  p_last, p_last_diff+ylab(""), 
  p_diff
), layout_matrix=m, heights=c(1,1,1,1,1.1))
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

#GGally::ggpairs(df[c("mean_age", "first", "last", "predictor", "diff")])
#pairs(df[c("mean_age", "first", "last", "predictor", "diff")], upper.panel=NULL)
#corrplot::corrplot(cor(df[c("mean_age", "first", "last", "predictor", "diff")], use="complete.obs"))

Learning curve: Model selection

# Candidate model 1: penalized-spline GAMM with a per-id random intercept.
# $lme carries the mixed-model fit (for AIC/residuals), $gam the smooth.
smoothing_spline = gamm(value ~ s(predictor, bs="ps"), random=list(id=~1), data=data)
summary(smoothing_spline$lme)
## Linear mixed-effects model fit by maximum likelihood
##  Data: strip.offset(mf) 
##        AIC      BIC    logLik
##   122134.3 122171.7 -61062.17
## 
## Random effects:
##  Formula: ~Xr - 1 | g
##  Structure: pdIdnot
##              Xr1      Xr2      Xr3      Xr4      Xr5      Xr6      Xr7      Xr8
## StdDev: 2.966859 2.966859 2.966859 2.966859 2.966859 2.966859 2.966859 2.966859
## 
##  Formula: ~1 | id %in% g
##         (Intercept) Residual
## StdDev:    46.81943 25.74474
## 
## Fixed effects: y ~ X - 1 
##                      Value Std.Error    DF  t-value p-value
## X(Intercept)     205.30328  3.727371 12835 55.07992  0.0000
## Xs(predictor)Fx1 -14.25125  9.018131 12835 -1.58029  0.1141
##  Correlation: 
##                  X(Int)
## Xs(predictor)Fx1 0.015 
## 
## Standardized Within-Group Residuals:
##          Min           Q1          Med           Q3          Max 
## -11.30836268  -0.27957744   0.09322983   0.42742641   6.69782984 
## 
## Number of Observations: 12997
## Number of Groups: 
##         g id %in% g 
##         1       161
summary(smoothing_spline$gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## value ~ s(predictor, bs = "ps")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  205.303      3.727   55.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df     F  p-value    
## s(predictor) 5.879  5.879 4.502 0.000127 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.00781   
##   Scale est. = 662.79    n = 12997
# Candidate model 2: linear mixed model with a per-id random intercept.
REM_linear = lmer(value ~ (1|id) + predictor, data)
# Fixed-effects line, for plotting via stat_function.
equ_linear = function(t) fixef(REM_linear)[1] + fixef(REM_linear)[2]*t
summary(REM_linear)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor
##    Data: data
## 
## REML criterion at convergence: 122140.5
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -11.2951  -0.2782   0.0944   0.4320   6.6383 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 2218.6   47.10   
##  Residual              664.3   25.77   
## Number of obs: 12997, groups:  id, 161
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept) 205.069823   3.737751  54.864
## predictor    -0.002631   0.002544  -1.034
## 
## Correlation of Fixed Effects:
##           (Intr)
## predictor -0.027
# Candidate model 3: quadratic mixed model (same random-intercept structure).
REM_quadratic = lmer(value ~ (1|id) + predictor + I(predictor^2), data)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
# Fixed-effects parabola, for plotting via stat_function.
equ_quadratic = function(t) fixef(REM_quadratic)[1] + fixef(REM_quadratic)[2]*t + fixef(REM_quadratic)[3]*t^2
summary(REM_quadratic)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor + I(predictor^2)
##    Data: data
## 
## REML criterion at convergence: 122153.8
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -11.2977  -0.2803   0.0963   0.4302   6.6705 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 2209     47.00   
##  Residual              664     25.77   
## Number of obs: 12997, groups:  id, 161
## 
## Fixed effects:
##                  Estimate Std. Error t value
## (Intercept)     2.047e+02  3.733e+00  54.830
## predictor       1.217e-02  5.924e-03   2.055
## I(predictor^2) -3.029e-05  1.095e-05  -2.767
## 
## Correlation of Fixed Effects:
##             (Intr) prdctr
## predictor   -0.047       
## I(prdctr^2)  0.039 -0.903
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
# Candidate model 4: nonlinear mixed bounded-growth (asymptotic regression,
# SSasymp) with per-id random effects on the start (y0) and asymptote (yf).
# NOTE(review): the fit reports non-convergence after 10000 evaluations —
# the estimates below are from the last iterate; interpret with care.
REM_bounded = nlmer(value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0|id) + (yf|id), data = data, start=c(yf=40, y0=20, log_alpha=-1))
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
# Fixed-effect estimates reused as globals by the helper functions below.
y0=fixef(REM_bounded)["y0"]
yf=fixef(REM_bounded)["yf"]
log_alpha=fixef(REM_bounded)["log_alpha"]
equ_bounded = function(t, yf=fixef(REM_bounded)[["yf"]], y0=fixef(REM_bounded)[["y0"]], log_alpha=fixef(REM_bounded)[["log_alpha"]]) yf+(y0-yf)*exp(-exp(log_alpha)*t)
summary(REM_bounded)
## Warning in vcov.merMod(object, use.hessian = use.hessian): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
## Warning in vcov.merMod(object, correlation = correlation, sigm = sig): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
## Nonlinear mixed model fit by maximum likelihood  ['nlmerMod']
## Formula: value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0 | id) + (yf |  
##     id)
##    Data: data
## 
##      AIC      BIC   logLik deviance df.resid 
## 120864.4 120909.2 -60426.2 120852.4    12991 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -12.0839  -0.2904   0.0757   0.4076   5.1580 
## 
## Random effects:
##  Groups   Name Variance Std.Dev.
##  id       y0   2528.6   50.28   
##  id.1     yf   3967.8   62.99   
##  Residual       584.8   24.18   
## Number of obs: 12997, groups:  id, 161
## 
## Fixed effects:
##            Estimate Std. Error t value
## yf        207.27795    5.81502   35.65
## y0        205.01053    4.02191   50.97
## log_alpha  -3.84930    0.06442  -59.76
## 
## Correlation of Fixed Effects:
##           yf     y0    
## y0        -0.052       
## log_alpha  0.007 -0.005
## convergence code: 0
## failure to converge in 10000 evaluations
# Growth-rate constant alpha (the model estimates it on the log scale).
exp(log_alpha)
##  log_alpha 
## 0.02129461
# Model-implied relative improvement from baseline (y0) to asymptote (yf).
cat("Average improvement over baseline: ", (yf-y0)/y0*100)
## Average improvement over baseline:  1.105999
# Goodness-of-fit comparison across the four candidate models:
# RMSE, MAE, AIC and effective degrees of freedom.
RMSE = sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) sqrt(mean(resid(mod)^2))) # RMSE
RMSE
## [1] 25.61528 25.60831 25.58158 23.92620
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) mean(abs(resid(mod)))) # MAE
## [1] 15.89611 15.89554 15.85751 14.71605
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) extractAIC(mod)) # edf & AIC: smoothing_spline$lme always has edf 5
##          [,1]     [,2]     [,3]     [,4]
## [1,]      4.0      5.0      5.0      6.0
## [2,] 122142.8 122137.2 122134.3 120864.4
edf = sapply(list(REM_linear, REM_quadratic, smoothing_spline$gam, REM_bounded), function(mod) { nrow(data)-df.residual(mod) }) # while smoothing_spline$gam often has much higher edf
edf
## [1] 4.000000 5.000000 6.879167 6.000000
# Formatted (as strings, via the local round shadow) for the plot legend.
RMSE = round(RMSE,1)
edf = round(edf,1)
Plot

# Number of participants still contributing data at each predictor value;
# the x-axis is truncated (xmax) where fewer than 10 participants remain.
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(predictor), .groups="keep") %>% deframe()
remaining_participants_bins = seq(0,350,by=10)
remaining_participants = data.frame(x=remaining_participants_bins, text=sapply(remaining_participants_bins, function(x) sum(participation_duration>=x)))

xmax = remaining_participants$x[which(remaining_participants$text<10)[1]]

# y-axis limits: central 90% of all observed values.
range_90p =quantile(data$value, probs=c(0.05,0.95))

# All four candidate fits overlaid on the individual trajectories, with a
# legend reporting each model's RMSE and edf.
p1 = ggplot() +
  geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.1) +
  geom_line(aes(x,y), data.frame(x=0:xmax, y=predict(smoothing_spline$gam, newdata=data.frame(predictor=0:xmax))), linetype="longdash", size=1) + xlim(0,xmax) + ylim(range_90p[1], range_90p[2]) +
  stat_function(fun=equ_linear, color="blue", linetype="dotted", size=1) + 
  stat_function(fun=equ_quadratic, color="green4", linetype="dashed", size=1) + 
  stat_function(fun=equ_bounded, color=linecolor, size=1) + 
  theme_pubr(base_family="Serif") + no_x + xlab(NULL) + ylab(params$unit) +
  geom_richtext(aes(x,y,label=label,hjust=1), data.frame(x=0.8*xmax, y=range_90p[2]-(range_90p[2]-range_90p[1])/1.3, label=paste0("Model RMSE (edf):<br><span style='color:#0000ff'>····· Linear: ", RMSE[1], " (", edf[1], ") </span><br><span style='color:#008b00'>- - - Quadratic: ", RMSE[2], " (", edf[2], ") </span><br><span style='color:#000000'>— — Smoothing spline: ", RMSE[3], " (", edf[3], ") </span><br><span style='color:", linecolor, "'>— Bounded growth: ", RMSE[4], " (", edf[4], ") </span>")), family="Serif")

# Strip under the main panel showing remaining-participant counts.
p2 = ggplot(remaining_participants[remaining_participants$x<=xmax,]) +
  geom_text(aes(x=x,y="A",label=text), family="Serif") +
  theme_pubr(base_family="Serif") + no_y + xlab(params$xlab) + ylab(paste0("Remaining \n ", params$unit_n, "s")) +
  scale_x_continuous(breaks=seq(0,xmax,by=10))

(p1 / p2) + plot_layout(heights=c(0.9,0.1))
## Warning: Removed 1144 row(s) containing missing values (geom_path).

# Profile-likelihood CIs for the bounded-growth fixed effects (expensive;
# skipped for this test since bounded.growth.confidence.interval = F).
if (params$bounded.growth.confidence.interval) conf = confint.merMod(REM_bounded, c("y0","yf","log_alpha"), method="profile")
if (params$bounded.growth.confidence.interval) {
  print(conf)
  # alpha on the natural scale.
  print(exp(conf[3,]))
  print(paste0("Average boundary improvement over baseline: ", round((yf-y0)/y0*100, 1), " (", round((conf[1,1]-conf[2,1])/conf[2,1]*100, 1), "-", round((conf[1,2]-conf[2,2])/conf[2,2]*100, 1), ")"))
}

Bounded growth model

# First derivative (slope) of the bounded-growth curve at time t.
equ_diff_REM_bounded = function(t) exp(log_alpha)*(yf-y0)*exp(exp(log_alpha)*-t)
# Inverse of the derivative: time at which the slope equals target_slope.
equ_diff_get_time_REM_bounded = function(target_slope) log(exp(log_alpha)*(yf-y0)/target_slope)/exp(log_alpha)

# Inverse of the curve itself: time at which the value reaches target_value.
equ_bounded_get_x = function(target_value) exp(-log_alpha)*log((yf-y0)/(yf-target_value))

# Annotate the curve at baseline, the half-practice point and 90% practice.
growth_percentiles = c(0, 0.5, 0.9)
names_percentiles = c("baseline", "half-practice point", "90% practice")
selected_timepoints = equ_bounded_get_x(y0+(yf-y0)*growth_percentiles)
example_slopes_bounded = data.frame(
  x=selected_timepoints,
  y=equ_bounded(selected_timepoints),
  label=paste0("y=", round(equ_bounded(selected_timepoints),1), ", m=", signif(equ_diff_REM_bounded(selected_timepoints),2), " at ", params$unit_time, " ", round(selected_timepoints,0), ", ", names_percentiles),
  vjust=1.5
)
# Label-only row for the asymptote (excluded from the point layer below).
example_slopes_bounded = rbind(example_slopes_bounded, list(x=0.83*xmax, y=yf, label=paste0("boundary: ", round(yf, 1)), vjust=-1.0))

if (params$bounded.growth.confidence.interval) ribbon = data.frame(x=seq(0,xmax,0.05), ymin=equ_bounded(seq(0,xmax,0.05), conf["yf","2.5 %"], conf["y0","2.5 %"], conf["log_alpha","2.5 %"]), ymax=equ_bounded(seq(0,xmax,0.05), conf["yf","97.5 %"], conf["y0","97.5 %"], conf["log_alpha","97.5 %"]))

# Report test counts per patient for the included cohort.
quant = quantile(table(data$id))
print(paste0("n tests = ", nrow(data), " (n ", params$unit_n, "s = ", length(unique(data$id)), ", median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 12997 (n patients = 161, median tests per patient: 37, IQR 15-84)"
# Spaghetti plot with the fitted bounded-growth curve and annotations.
p1 = ggplot() + geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.2) +
  theme_pubr(base_family="Serif") + scale_x_continuous(limits = c(0,xmax), expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + xlab(params$xlab) + ylab(params$unit) +
  geom_vline(xintercept=example_slopes_bounded[2,"x"], color=linecolor, linetype=2) +
  stat_function(fun=equ_bounded, color=linecolor, size=1) +
  geom_point(data=example_slopes_bounded[1:(nrow(example_slopes_bounded)-1),], aes(x,y), color=linecolor, size=5) +
  geom_text(data=example_slopes_bounded, aes(x,y,label=label, vjust=vjust), color=linecolor, hjust=-0.01, family="Serif")

if (params$bounded.growth.confidence.interval) p1 = p1 + geom_ribbon(aes(x=x, ymin=ymin, ymax=ymax), ribbon, fill=linecolor, alpha=0.3)

p1
## Warning: Removed 852 row(s) containing missing values (geom_path).

Quantile regression

# Censoring point for quantile regression: params$censor_after is not set in
# the params list above, so this defaults to the model's half-practice point.
if (is.null(params$censor_after)) {
  censor_after = as.integer(round(selected_timepoints[2])) # half-practice point
} else {
  censor_after = params$censor_after
}
data_censored = data[data$predictor <= censor_after,]

percentiles = c(0.05,0.25,0.5,0.75,0.95)

# Quantile regression of score on repetition at five quantiles simultaneously.
QR = rq(value ~ predictor, tau=percentiles, data_censored)
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
# Per-quantile inference with kernel-based standard errors. The summary list
# is computed once and reused; the original recomputed summary(QR, se="ker")
# on every iteration (and called summary(QR) once more just for its length).
QR_summaries = summary(QR, se="ker")
p_vals = sapply(seq_along(QR_summaries), function(i) {
  summ = coef(QR_summaries[[i]])
  print(summ)
  # Wald 95% CIs (estimate +/- 1.96 * SE) for intercept and slope
  print(paste0("Intercept: ", round(summ[1,1],1), " (", round(summ[1,1]-1.96*summ[1,2],1), "-", round(summ[1,1]+1.96*summ[1,2],1), "), beta: ", round(summ[2,1],2), " (", round(summ[2,1]-1.96*summ[2,2],2), "-", round(summ[2,1]+1.96*summ[2,2],2), ")"))
  summ[2,4] # Pr(>|t|) of the slope (predictor) coefficient
})
##             Value Std. Error    t value     Pr(>|t|)
## (Intercept)    99  12.634385  7.8357592 8.437695e-15
## predictor      -1   2.488586 -0.4018346 6.878606e-01
## [1] "Intercept: 99.0 (74.2-123.8), beta: -1.00 (-5.88-3.88)"
##             Value Std. Error   t value  Pr(>|t|)
## (Intercept)   181  4.2033466 43.060927 0.0000000
## predictor       1  0.7578666  1.319493 0.1871971
## [1] "Intercept: 181.0 (172.8-189.2), beta: 1.00 (-0.49-2.49)"
##             Value Std. Error   t value  Pr(>|t|)
## (Intercept) 217.5  2.3848809 91.199525 0.0000000
## predictor     0.5  0.4438119  1.126603 0.2600828
## [1] "Intercept: 217.5 (212.8-222.2), beta: 0.50 (-0.37-1.37)"
##             Value Std. Error    t value  Pr(>|t|)
## (Intercept) 237.0  2.0164631 117.532525 0.0000000
## predictor     0.6  0.3844522   1.560662 0.1188055
## [1] "Intercept: 237.0 (233.0-241.0), beta: 0.60 (-0.15-1.35)"
##             Value Std. Error    t value  Pr(>|t|)
## (Intercept) 262.0  2.1601758 121.286426 0.0000000
## predictor     0.5  0.4307378   1.160799 0.2459006
## [1] "Intercept: 262.0 (257.8-266.2), beta: 0.50 (-0.34-1.34)"
# Bonferroni correction across the five quantiles tested.
p_vals = p.adjust(p_vals, method="bonferroni")

# Joint test of slope equality across quantiles.
ANOVA = anova(QR)

# Summary of the censored subset actually used for quantile regression.
quant = quantile(table(data_censored$id))
print(paste0("n tests = ", nrow(data_censored), " (median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 1568 (median tests per patient: 10, IQR 10-10)"
# Format a p-value for display: round to `digits` significant figures and
# prefix with "= "; values that collapse to exactly 0 are reported as
# "< 2e-16" (the smallest p-value R prints).
signif_p = function(x, digits=1) {
  rounded = signif(x, digits)
  if (as.character(rounded) == "0") {
    return("< 2e-16")
  }
  paste0("= ", rounded)
}

# Censored trajectories with one fitted regression line per quantile, each
# labeled with its Bonferroni-adjusted slope p-value; ANOVA p in the corner.
ggplot() + geom_line(aes_string("predictor", "value", group="id"), data_censored, alpha=0.2, color="darkgrey") + theme_pubr(base_family="Serif") + scale_x_continuous(expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + theme(legend.position = "none") + xlab(params$xlab) + ylab(params$unit) +
  geom_abline(intercept=coef(QR)[1,], slope=coef(QR)[2,], color=linecolor) +
  geom_text(data=data.frame(intercept=coef(QR)[1,], label=paste0(percentiles*100, "th percentile: β = ", round(coef(QR)[2,],1), ", ", "p.adj ", sapply(p_vals,signif_p))),
            mapping=aes(x=1,y=intercept, label=label), color=linecolor, hjust="left", vjust=1, family="Serif") +
  coord_cartesian(xlim=c(0,censor_after)) +
  geom_text(aes(x=x,y=y,label=label), data.frame(x=0.8*censor_after, y=0, label=paste0("ANOVA p ", signif_p(ANOVA$table$pvalue, 1))), vjust=-1.5, family="Serif") +
  geom_vline(xintercept=censor_after, color=linecolor, linetype=2)

Sensitivity analysis 1

Performance as a function of number of repetitions Minimum number of repetitions: 10, minimum number of weeks: 10

Cognition (SDMT)

# Sensitivity-analysis parameters: same SDMT metric as the main analysis, but
# with stricter inclusion criteria (>= 10 repetitions over >= 10 weeks) and
# without the (slow) profile-likelihood CI for the bounded-growth model.
params = list(
  test_code = "ips",
  test_metric_code = "correct_responses",
  unit = "SDMT: Correct Responses",
  unit_n = "patient",
  unit_time = "repetition",
  min_repetitions = 10,
  min_weeks = 10,
  predictor = "repetition",
  xlab = "Repetitions",
  bounded.growth.confidence.interval = FALSE, # spelled out: F is reassignable
  up_to_date = "2021-05-01"
)
library(data.table) # fread
library(parsedate) # parse_date
library(dplyr) # group_by
library(tibble) # deframe
library(lme4) # lmer
library(mgcv) # gamm
library(quantreg) # rq
library(patchwork) # plot_layout
library(gridExtra) # grid.arrange
library(ggpubr) # ggscatter
library(ggtext) # geom_text
library(sjPlot) # plot_model

# Download from: https://dataset.floodlightopen.com/public-blobs-prod/complete_dataset.csv
data = fread("complete_dataset.csv", data.table=F)

# Prepare dataset: keep only the selected test for non-control participants.
data = data[data$testCode == params$test_code & !data$participantIsControl,]
data$time = parse_date(data$testStartedAt)
data = data[data$time <= as.POSIXct(params$up_to_date, tz="UTC"),] # only analyse data up to (excluding) params$up_to_date
data = data[!duplicated(data),] # sometimes contains true duplicates for some reason (even with the same testResultMetricId)

# For "Finger Pinching" hand_used has to be determined
# For "Finger Pinching" hand_used has to be determined by widening the two
# metric rows (hand + successful_pinches) per timestamp; all other tests keep
# one metric row and get hand_used = NA.
if (params$test_code == "pinching") {
  library(tidyr) # pivot_wider
  # just one means either "hand" or "successful_pinches" values are missing, remove those
  table(table(data$time)) # NOTE(review): value unused inside the branch — leftover diagnostic?
  data = data[!data$time %in% names(which(table(data$time)==1)), ]
  data = as.data.frame(pivot_wider(data, id_cols=c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time"), names_from="testMetricCode", values_from="testResultMetricValue"))
} else {
  data = data[data$testMetricCode == params$test_metric_code,]
  data$hand_used = NA
  data = data[c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time", "testResultMetricValue", "hand_used")]
}

# Standardize column names; downstream code relies on this exact order.
colnames(data) = c("id", "control", "sex", "birthyear", "time", "value", "hand_used")
data$age = year(data$time)-data$birthyear # Estimate age
data = data[order(as.character(data$id)),]

# 0 result values are discarded
data = data[!is.na(data$value) & data$value != 0,]

# Consider those supposedly younger than 18 (minimum study age) and older than 90 as NA
data$age[data$age < 18 | data$age > 90] = NA

# For pinching each hand is analysed as its own unit; elsewhere hand_used is
# NA so id becomes "<id>_handNA" for every row (one unit per participant).
data$id_original = data$id
data$id = paste0(data$id, "_hand", data$hand_used)

data$day = as.IDate(data$time)
round = function(x, digits=0) sprintf(paste0("%.", digits, "f"), x)

# Reusable ggplot fragments: hide x/y axis text+ticks on stacked panels.
no_x = theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
no_y = theme(axis.text.y=element_blank(), axis.ticks.y=element_blank())

# Accent color used for model fits and annotations throughout.
linecolor = "#c71138"

Participant selection

# Number each test per participant (0-based "repetition") and compute weeks
# since that participant's first test. Assumes rows are in chronological
# order within id (data was ordered above).
for (id in unique(data$id)) {
  subset = data$id == id
  n = sum(subset)
  data[subset, "repetition"] = seq_len(n) - 1 # seq_len() is safe even for n == 0, unlike 1:n
  # `units=` spelled out in full; the original `unit=` only worked via
  # partial argument matching.
  data[subset, "weeksSinceFirst"] = as.numeric(difftime(data[subset, "time"], data[subset, "time"][1], units="weeks"))
}

# Pre-filter denominators for the inclusion-rate table further below.
n_orig = nrow(data)
n_patients_orig = length(unique(data$id_original))
n_hands_orig = length(unique(data$id))

# Per-unit participation length; last() is valid because rows are ordered by time within id.
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(weeksSinceFirst), repetitions=last(repetition), .groups="keep")

Among the total n=1095 patients with n=5715 repetitions, the median length of participation is 0.0 weeks (IQR 0.0-8.6, range 0.0-151.6) and the median number of repetitions is 1 (IQR 1-4, range 1-106).

# Keep only participants meeting both inclusion criteria (repetition is
# 0-based, hence the +1 to turn it into a count of repetitions).
data = data[data$id %in% participation_duration$id[participation_duration$weeks >= params$min_weeks & participation_duration$repetitions+1 >= params$min_repetitions],]

# Days elapsed since the participant's previous test (0 for the first test,
# which is paired with itself).
for (id in unique(data$id)) {
  subset = data$id == id
  times = data[subset, "time"]
  # Pair each timestamp with its predecessor. head(times, -1) is empty when a
  # participant has a single test, avoiding the 1:(n-1) == 1:0 indexing bug
  # of the original for n == 1. `units=` spelled out (was partially matched).
  data[subset, "daysSinceLast"] = as.numeric(difftime(times, c(times[1], head(times, -1)), units="days"))
}

# Recompute per-unit summaries on the filtered cohort, now including
# demographics and inter-test interval statistics.
participation_duration = data %>% group_by(id) %>% summarise(sex=first(sex), mean_age=mean(age), weeks=last(weeksSinceFirst), repetitions=last(repetition), median_intertest_interval=median(daysSinceLast), IQR_intertest_interval=IQR(daysSinceLast), .groups="keep")

# Generic predictor column so later code is agnostic to repetition vs. week.
data$predictor = data[,params$predictor]

Inclusion criteria: participation for at least 10 weeks and at least 10 repetitions performed per test, leading to the analysis of n=135 / 1095 patients and n=3627 / 5715 tests. Among those, the median length of participation is 38.6 weeks (IQR 15.5-64.2, range 10.0-151.6) and the median number of repetitions is 17 (IQR 13.5-33, range 10-106).

# One-row-per-statistic cohort description table. NOTE: round() here is the
# shadowed sprintf-based version and returns formatted strings.
# NOTE(review): prop.table(...)[[2]] assumes both FALSE and TRUE levels are
# present in the sex table — verify if the cohort ever becomes single-sex.
t(data.frame(
  n_patients = paste0(length(unique(data$id_original)), " / ", n_patients_orig, " (", round(length(unique(data$id_original))/n_patients_orig*100,1), "%)"),
  n_hands = paste0(length(unique(data$id)), " / ", n_hands_orig, " (", round(length(unique(data$id))/n_hands_orig*100,1), "%)"),
  n_tests = paste0(nrow(data), " / ", n_orig, " (", round(nrow(data)/n_orig*100,1), "%)"),
  percent_female = paste0(round(prop.table(table(participation_duration$sex == "female"))[[2]]*100, 1)),
  age = paste0(round(median(participation_duration$mean_age,na.rm=T),1), " (", round(quantile(participation_duration$mean_age, 0.25, na.rm=T),1), "-", round(quantile(participation_duration$mean_age, 0.75, na.rm=T),1), ", range ", round(min(participation_duration$mean_age, na.rm=T),1), "-", round(max(participation_duration$mean_age, na.rm=T),1), ")"),
  repetitions = paste0(median(participation_duration$repetitions)+1, " repetitions (IQR ", quantile(participation_duration$repetitions+1, 0.25), "-", quantile(participation_duration$repetitions+1, 0.75), ", range ", min(participation_duration$repetitions+1), "-", max(participation_duration$repetitions+1), ")"),
  median_intertest_interval = paste0(round(median(participation_duration$median_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$median_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$median_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$median_intertest_interval),1), "-", round(max(participation_duration$median_intertest_interval),1), ")"),
  IQR_intertest_interval = paste0(round(median(participation_duration$IQR_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$IQR_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$IQR_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$IQR_intertest_interval),1), "-", round(max(participation_duration$IQR_intertest_interval),1), ")"),
  weeks = paste0(round(median(participation_duration$weeks),1), " weeks (IQR ", round(quantile(participation_duration$weeks, 0.25),1), "-", round(quantile(participation_duration$weeks, 0.75),1), ", range ", round(min(participation_duration$weeks),1), "-", round(max(participation_duration$weeks),1), ")")
))
##                           [,1]                                          
## n_patients                "135 / 1095 (12.3%)"                          
## n_hands                   "135 / 1095 (12.3%)"                          
## n_tests                   "3627 / 5715 (63.5%)"                         
## percent_female            "68.1"                                        
## age                       "51.1 (43.3-57.9, range 22.9-74.3)"           
## repetitions               "17 repetitions (IQR 13.5-33, range 10-106)"  
## median_intertest_interval "7.1 days (IQR 7.0-8.8, range 6.7-38.8)"      
## IQR_intertest_interval    "1.7 days (IQR 0.4-5.8, range 0.0-46.0)"      
## weeks                     "38.6 weeks (IQR 15.5-64.2, range 10.0-151.6)"

Summary level analysis

Difference test

# Per-unit first/last/mean scores and their difference, for the paired t-test
# and the difference regression below. `repetition=n()` is the test count here
# (not the 0-based index used elsewhere).
df = as.data.frame(data %>% group_by(id) %>% summarise(first=first(value), last=last(value), mean=mean(value), weeksSinceFirst=max(weeksSinceFirst), repetition=n(), first_age=first(age), last_age=last(age), mean_age=mean(age), .groups="keep") %>% mutate(diff=last-first))

df$predictor = df[, params$predictor]

# Paired t-test: last vs. first score within each participant.
test = t.test(df$last, df$first, paired=T)
test
## 
##  Paired t-test
## 
## data:  df$last and df$first
## t = 19.262, df = 134, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  10.31587 12.67672
## sample estimates:
## mean of the differences 
##                 11.4963
# Baseline model: improvement explained by age and first score only (age and
# score rescaled per 10 units for readable coefficients).
mod0 = lm(diff ~ I(mean_age/10) + I(first/10), df)
summ0 = summary(mod0)
summ0
## 
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.2315  -4.0548  -0.1762   3.9750  22.8922 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     24.3737     5.7055   4.272 3.70e-05 ***
## I(mean_age/10)  -0.2776     0.6852  -0.405    0.686    
## I(first/10)     -2.9466     0.7311  -4.030 9.39e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.469 on 131 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1492, Adjusted R-squared:  0.1362 
## F-statistic: 11.48 on 2 and 131 DF,  p-value: 2.541e-05
# Full model: adds log10(number of repetitions) as a predictor of improvement.
mod = lm(diff ~ I(mean_age/10) + I(first/10) + log10(predictor), df)
confint(mod)
##                      2.5 %      97.5 %
## (Intercept)       6.260899 27.86718094
## I(mean_age/10)   -2.616447  0.01271983
## I(first/10)      -4.537473 -1.86488564
## log10(predictor)  6.045947 14.13346888
# Keep the summary object; plot_model() below is built from it.
summ = summary(mod)
summ
## 
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10) + log10(predictor), 
##     data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.3454  -3.8305   0.1391   4.3111  21.4638 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       17.0640     5.4606   3.125  0.00219 ** 
## I(mean_age/10)    -1.3019     0.6645  -1.959  0.05223 .  
## I(first/10)       -3.2012     0.6754  -4.739 5.54e-06 ***
## log10(predictor)  10.0897     2.0440   4.936 2.39e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.959 on 130 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2835, Adjusted R-squared:  0.2669 
## F-statistic: 17.14 on 3 and 130 DF,  p-value: 1.932e-09
# additional variance explained by predictor
#print(summ$r.squared - summ0$r.squared)

# Mean first-to-last improvement (and its 95% CI) as a percentage of the
# mean baseline score.
print(paste0("Average observed improvement over baseline: ", round(test$estimate/mean(df$first)*100, 1), " (", round(test$conf[1]/mean(df$first)*100, 1), "-", round(test$conf[2]/mean(df$first)*100, 1), ")"))
## [1] "Average observed improvement over baseline: 29.6 (26.5-32.6)"
# y-position for the (currently disabled) significance bracket.
lab.y = 1.1*mean(df$last)

# Bar plot of first/mean/last scores (mean +/- SE) ...
p1 = ggbarplot(data.frame(Timepoint=rep(c("First","Mean","Last"),each=nrow(df)), value=c(df$first,df$mean,df$last)), "Timepoint", "value", add="mean_se", label=T, lab.nb.digits=1, lab.vjust=1.9, ylab=params$unit) + xlab("Score") #+ stat_compare_means(comparisons = list(c("First","Last")), paired=T, method="t.test", label.y=lab.y) + scale_y_continuous(expand=expansion(mult=c(0,0.1)))

# ... next to a forest plot of the difference-regression coefficients.
p2 = plot_model(summ, show.values=T, vline.color = "grey", show.intercept=T, colors=linecolor, title=paste0("Difference from First to Last Score, R²=", round(summ$r.squared, 2)), axis.labels=rev(c("Intercept", "Age (per 10 years)", "First score (per 10)", paste0(params$xlab, " (log 10)"))), value.offset=0.3, show.p=F) + ylab("β estimates")

(p1 + p2) + plot_layout(widths=c(2,5)) & theme_pubr(base_family="Serif")

Confounders

# Pairwise confounder scatter plots (with regression line + correlation) for
# mean_age, first, last, predictor (log10) and diff; arranged below as the
# lower triangle of a 5x5 matrix with histograms on the diagonal.
p_age_first = ggscatter(df, "mean_age", "first", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("First score") + theme_pubr(base_family="Serif")

p_age_pred = ggscatter(df, "mean_age", "predictor", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("Mean age") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")

p_age_last = ggscatter(df, "mean_age", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("Last score") + theme_pubr(base_family="Serif")

p_age_diff = ggscatter(df, "mean_age", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("Difference first to last score") + theme_pubr(base_family="Serif")

p_first_pred = ggscatter(df, "first", "predictor",  add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("First score") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")

p_first_last = ggscatter(df, "first", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("First score") + ylab("Last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_first_diff = ggscatter(df, "first", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("First score") + ylab("Difference first to last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_pred_last = ggscatter(df, "predictor", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_x_log10(expand = expansion(mult = c(.05, .15))) + xlab(paste0(params$xlab, " (log10)")) + ylab("Last score") + theme_pubr(base_family="Serif")

p_pred_diff = ggscatter(df, "predictor", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_x_log10(expand = expansion(mult = c(.05, .15))) + xlab(paste0(params$xlab, " (log10)")) + ylab("Difference first to last score") + theme_pubr(base_family="Serif")

p_last_diff = ggscatter(df, "last", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Last score") + ylab("Difference first to last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_last_pred = ggscatter(df, "last", "predictor",  add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("Last score") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")


# Marginal distributions for the diagonal of the matrix layout.
p_age = gghistogram(df, "mean_age", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_first = gghistogram(df, "first", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_last = gghistogram(df, "last", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_pred = gghistogram(df, "predictor", bins=15) + scale_x_log10() + xlab(NULL) + theme_pubr(base_family="Serif")
p_diff = gghistogram(df, "diff", bins=15) + xlab("Difference first to last") + theme_pubr(base_family="Serif")

#(((p1+xlab(NULL)) + (p2+xlab(NULL)+ylab(NULL))) / ((p3+xlab(NULL)) + (p4+xlab(NULL)+ylab(NULL))) / ((p5) | (p6+ylab(NULL)))) & theme_pubr(base_family="Serif")

#(p_age_first | p_first) / (p_age_last | p_first_last | p_last) / (p_age_pred | p_first_pred | p_last_pred | p_pred) / (p_age_diff | p_first_diff | p_last_diff | p_pred_diff)

# Lower-triangle layout matrix: cells 1..15 filled column-wise, upper triangle NA.
m <- matrix(NA, 5, 5)
m[lower.tri(m, diag = T)] <- 1:15
grid.arrange(grobs=list(
  p_age, p_age_first+xlab(NULL), p_age_pred+xlab(NULL), p_age_last+xlab(NULL), p_age_diff,
  p_first, p_first_pred+xlab(NULL)+ylab(""), p_first_last+xlab(NULL)+ylab(""), p_first_diff+ylab(""),
  p_pred, p_pred_last+xlab(NULL)+ylab(""), p_pred_diff+ylab(""),
  p_last, p_last_diff+ylab(""), 
  p_diff
), layout_matrix=m, heights=c(1,1,1,1,1.1))
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

#GGally::ggpairs(df[c("mean_age", "first", "last", "predictor", "diff")])
#pairs(df[c("mean_age", "first", "last", "predictor", "diff")], upper.panel=NULL)
#corrplot::corrplot(cor(df[c("mean_age", "first", "last", "predictor", "diff")], use="complete.obs"))

Learning curve: Model selection

# Candidate model 1: penalized-spline GAMM with a random intercept per unit.
smoothing_spline = gamm(value ~ s(predictor, bs="ps"), random=list(id=~1), data=data)
summary(smoothing_spline$lme)
## Linear mixed-effects model fit by maximum likelihood
##  Data: strip.offset(mf) 
##        AIC      BIC    logLik
##   20124.48 20155.46 -10057.24
## 
## Random effects:
##  Formula: ~Xr - 1 | g
##  Structure: pdIdnot
##              Xr1      Xr2      Xr3      Xr4      Xr5      Xr6      Xr7      Xr8
## StdDev: 5.193948 5.193948 5.193948 5.193948 5.193948 5.193948 5.193948 5.193948
## 
##  Formula: ~1 | id %in% g
##         (Intercept) Residual
## StdDev:    8.979859 3.508465
## 
## Fixed effects: y ~ X - 1 
##                     Value Std.Error   DF  t-value p-value
## X(Intercept)     49.07007  0.776989 3491 63.15416       0
## Xs(predictor)Fx1 31.80627  7.610944 3491  4.17902       0
##  Correlation: 
##                  X(Int)
## Xs(predictor)Fx1 0.002 
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -8.0520741 -0.5540131  0.0472338  0.6173327  4.6393189 
## 
## Number of Observations: 3627
## Number of Groups: 
##         g id %in% g 
##         1       135
summary(smoothing_spline$gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## value ~ s(predictor, bs = "ps")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  49.0701     0.7769   63.16   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df   F p-value    
## s(predictor) 8.098  8.098 346  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.105   
##   Scale est. = 12.309    n = 3627
# Candidate model 2: linear mixed model with a random intercept per unit.
REM_linear = lmer(value ~ (1|id) + predictor, data)
# Population-level prediction at time t (fixed effects only).
equ_linear = function(t) fixef(REM_linear)[1] + fixef(REM_linear)[2]*t
summary(REM_linear)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor
##    Data: data
## 
## REML criterion at convergence: 20876
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.7916 -0.5484  0.0860  0.6276  4.3273 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 80.12    8.951   
##  Residual             15.49    3.936   
## Number of obs: 3627, groups:  id, 135
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept) 45.309835   0.776676   58.34
## predictor    0.178547   0.004591   38.89
## 
## Correlation of Fixed Effects:
##           (Intr)
## predictor -0.077
# Candidate model 3: adds a quadratic term in the predictor.
REM_quadratic = lmer(value ~ (1|id) + predictor + I(predictor^2), data)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
# Population-level quadratic prediction at time t (fixed effects only).
equ_quadratic = function(t) fixef(REM_quadratic)[1] + fixef(REM_quadratic)[2]*t + fixef(REM_quadratic)[3]*t^2
summary(REM_quadratic)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor + I(predictor^2)
##    Data: data
## 
## REML criterion at convergence: 20540
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -7.0429 -0.5491  0.0598  0.6245  4.2671 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 81.2     9.011   
##  Residual             14.0     3.742   
## Number of obs: 3627, groups:  id, 135
## 
## Fixed effects:
##                  Estimate Std. Error t value
## (Intercept)    43.8243668  0.7850215   55.83
## predictor       0.3720297  0.0109614   33.94
## I(predictor^2) -0.0028201  0.0001466  -19.24
## 
## Correlation of Fixed Effects:
##             (Intr) prdctr
## predictor   -0.119       
## I(prdctr^2)  0.098 -0.917
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
# Candidate model 4: nonlinear mixed-effects bounded-growth (asymptotic
# regression, SSasymp) with random baseline (y0) and asymptote (yf) per unit.
REM_bounded = nlmer(value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0|id) + (yf|id), data = data, start=c(yf=40, y0=20, log_alpha=-1))
# Fixed effects used by all downstream bounded-growth helpers (named vectors).
y0=fixef(REM_bounded)["y0"]
yf=fixef(REM_bounded)["yf"]
log_alpha=fixef(REM_bounded)["log_alpha"]
# Population-level bounded-growth curve; defaults allow overriding the
# parameters (used for the CI ribbon).
equ_bounded = function(t, yf=fixef(REM_bounded)[["yf"]], y0=fixef(REM_bounded)[["y0"]], log_alpha=fixef(REM_bounded)[["log_alpha"]]) yf+(y0-yf)*exp(-exp(log_alpha)*t)
summary(REM_bounded)
## Nonlinear mixed model fit by maximum likelihood  ['nlmerMod']
## Formula: value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0 | id) + (yf |  
##     id)
##    Data: data
## 
##      AIC      BIC   logLik deviance df.resid 
##  20183.9  20221.1 -10086.0  20171.9     3621 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -7.2046 -0.5411  0.0572  0.6119  3.9848 
## 
## Random effects:
##  Groups   Name Variance Std.Dev.
##  id       y0    78.62    8.867  
##  id.1     yf   120.31   10.968  
##  Residual       11.59    3.405  
## Number of obs: 3627, groups:  id, 135
## 
## Fixed effects:
##           Estimate Std. Error t value
## yf        55.20086    1.09285   50.51
## y0        41.67523    0.78578   53.04
## log_alpha -2.71040    0.07266  -37.30
## 
## Correlation of Fixed Effects:
##           yf     y0    
## y0         0.037       
## log_alpha -0.448 -0.147
# Learning rate on the natural scale.
exp(log_alpha)
##  log_alpha 
## 0.06651013
# Relative asymptotic gain over the model's baseline.
cat("Average improvement over baseline: ", (yf-y0)/y0*100)
## Average improvement over baseline:  32.45486
# Compare the four candidate models by RMSE, MAE and AIC/edf.
RMSE = sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) sqrt(mean(resid(mod)^2))) # RMSE
RMSE
## [1] 3.861949 3.671241 3.439287 3.282036
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) mean(abs(resid(mod)))) # MAE
## [1] 2.916171 2.762293 2.577515 2.482498
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) extractAIC(mod)) # edf & AIC: smoothing_spline$lme always has edf 5
##          [,1]     [,2]     [,3]     [,4]
## [1,]     4.00     5.00     5.00     6.00
## [2,] 20876.41 20526.53 20124.48 20183.92
edf = sapply(list(REM_linear, REM_quadratic, smoothing_spline$gam, REM_bounded), function(mod) { nrow(data)-df.residual(mod) }) # while smoothing_spline$gam often has much higher edf
edf
## [1] 4.000000 5.000000 9.097696 6.000000
# Rounded (to strings, via the shadowed round()) for the plot legend below.
RMSE = round(RMSE,1)
edf = round(edf,1)

Plot

# Last predictor value per unit as a named vector. NOTE(review): the column is
# named "weeks" but holds last(predictor) — repetitions in this analysis.
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(predictor), .groups="keep") %>% deframe()
# Count units still contributing data at each 10-unit bin of the predictor.
remaining_participants_bins = seq(0,350,by=10)
remaining_participants = data.frame(x=remaining_participants_bins, text=sapply(remaining_participants_bins, function(x) sum(participation_duration>=x)))

# Truncate plots where fewer than 10 units remain.
xmax = remaining_participants$x[which(remaining_participants$text<10)[1]]

# Central 90% of observed values, used as plot y-limits.
range_90p =quantile(data$value, probs=c(0.05,0.95))

# Trajectories plus all four model fits, with an RMSE/edf legend; the bottom
# strip shows how many units remain at each predictor bin.
p1 = ggplot() +
  geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.1) +
  geom_line(aes(x,y), data.frame(x=0:xmax, y=predict(smoothing_spline$gam, newdata=data.frame(predictor=0:xmax))), linetype="longdash", size=1) + xlim(0,xmax) + ylim(range_90p[1], range_90p[2]) +
  stat_function(fun=equ_linear, color="blue", linetype="dotted", size=1) + 
  stat_function(fun=equ_quadratic, color="green4", linetype="dashed", size=1) + 
  stat_function(fun=equ_bounded, color=linecolor, size=1) + 
  theme_pubr(base_family="Serif") + no_x + xlab(NULL) + ylab(params$unit) +
  geom_richtext(aes(x,y,label=label,hjust=1), data.frame(x=0.8*xmax, y=range_90p[2]-(range_90p[2]-range_90p[1])/1.3, label=paste0("Model RMSE (edf):<br><span style='color:#0000ff'>····· Linear: ", RMSE[1], " (", edf[1], ") </span><br><span style='color:#008b00'>- - - Quadratic: ", RMSE[2], " (", edf[2], ") </span><br><span style='color:#000000'>— — Smoothing spline: ", RMSE[3], " (", edf[3], ") </span><br><span style='color:", linecolor, "'>— Bounded growth: ", RMSE[4], " (", edf[4], ") </span>")), family="Serif")

p2 = ggplot(remaining_participants[remaining_participants$x<=xmax,]) +
  geom_text(aes(x=x,y="A",label=text), family="Serif") +
  theme_pubr(base_family="Serif") + no_y + xlab(params$xlab) + ylab(paste0("Remaining \n ", params$unit_n, "s")) +
  scale_x_continuous(breaks=seq(0,xmax,by=10))

(p1 / p2) + plot_layout(heights=c(0.9,0.1))
## Warning: Removed 244 row(s) containing missing values (geom_path).

# Profile-likelihood CIs for the bounded-growth fixed effects (slow; skipped
# in this sensitivity analysis since the flag is FALSE).
if (params$bounded.growth.confidence.interval) conf = confint.merMod(REM_bounded, c("y0","yf","log_alpha"), method="profile")
if (params$bounded.growth.confidence.interval) {
  print(conf)
  # Learning rate CI on the natural scale.
  print(exp(conf[3,]))
  # NOTE(review): rows are indexed positionally (1=yf, 2=y0 per the model's
  # fixed-effect order) and limits are combined parameter-wise — approximate.
  print(paste0("Average boundary improvement over baseline: ", round((yf-y0)/y0*100, 1), " (", round((conf[1,1]-conf[2,1])/conf[2,1]*100, 1), "-", round((conf[1,2]-conf[2,2])/conf[2,2]*100, 1), ")"))
}

Bounded growth model

# First derivative (slope) of the fitted bounded-growth curve at time t.
equ_diff_REM_bounded = function(t) {
  rate = exp(log_alpha)
  rate * (yf - y0) * exp(rate * -t)
}
# Time at which the curve's slope has decayed to target_slope
# (inverse of the derivative above).
equ_diff_get_time_REM_bounded = function(target_slope) {
  rate = exp(log_alpha)
  log(rate * (yf - y0) / target_slope) / rate
}

# Time at which the curve reaches target_value (inverse of equ_bounded);
# undefined for target_value >= yf.
equ_bounded_get_x = function(target_value) {
  gain_ratio = (yf - y0) / (yf - target_value)
  exp(-log_alpha) * log(gain_ratio)
}

# Characteristic points on the curve: baseline, half-practice point (50% of
# the total practice gain) and the 90%-practice point.
growth_percentiles = c(0, 0.5, 0.9)
names_percentiles = c("baseline", "half-practice point", "90% practice")
selected_timepoints = equ_bounded_get_x(y0+(yf-y0)*growth_percentiles)
example_slopes_bounded = data.frame(
  x=selected_timepoints,
  y=equ_bounded(selected_timepoints),
  label=paste0("y=", round(equ_bounded(selected_timepoints),1), ", m=", signif(equ_diff_REM_bounded(selected_timepoints),2), " at ", params$unit_time, " ", round(selected_timepoints,0), ", ", names_percentiles),
  vjust=1.5
)
# Extra annotation row marking the asymptote (yf) near the plot's right edge.
example_slopes_bounded = rbind(example_slopes_bounded, list(x=0.83*xmax, y=yf, label=paste0("boundary: ", round(yf, 1)), vjust=-1.0))

# Approximate CI ribbon from parameter-wise limits; skipped here since the
# flag is FALSE in this sensitivity analysis (conf is then never computed).
if (params$bounded.growth.confidence.interval) ribbon = data.frame(x=seq(0,xmax,0.05), ymin=equ_bounded(seq(0,xmax,0.05), conf["yf","2.5 %"], conf["y0","2.5 %"], conf["log_alpha","2.5 %"]), ymax=equ_bounded(seq(0,xmax,0.05), conf["yf","97.5 %"], conf["y0","97.5 %"], conf["log_alpha","97.5 %"]))

# Cohort summary for the stricter inclusion criteria.
quant = quantile(table(data$id))
print(paste0("n tests = ", nrow(data), " (n ", params$unit_n, "s = ", length(unique(data$id)), ", median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 3627 (n patients = 135, median tests per patient: 17, IQR 13.5-33)"
# Trajectories with the fitted bounded-growth curve and annotated points.
p1 = ggplot() + geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.2) +
  theme_pubr(base_family="Serif") + scale_x_continuous(limits = c(0,xmax), expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + xlab(params$xlab) + ylab(params$unit) +
  geom_vline(xintercept=example_slopes_bounded[2,"x"], color=linecolor, linetype=2) +
  stat_function(fun=equ_bounded, color=linecolor, size=1) +
  geom_point(data=example_slopes_bounded[1:(nrow(example_slopes_bounded)-1),], aes(x,y), color=linecolor, size=5) +
  geom_text(data=example_slopes_bounded, aes(x,y,label=label, vjust=vjust), color=linecolor, hjust=-0.01, family="Serif")

if (params$bounded.growth.confidence.interval) p1 = p1 + geom_ribbon(aes(x=x, ymin=ymin, ymax=ymax), ribbon, fill=linecolor, alpha=0.3)

p1
## Warning: Removed 91 row(s) containing missing values (geom_path).

Quantile regression

# Censoring point for the quantile regression: an explicit override in
# params$censor_after wins; otherwise censor at the model-derived
# half-practice point of the bounded-growth curve.
censor_after = if (!is.null(params$censor_after)) {
  params$censor_after
} else {
  as.integer(round(selected_timepoints[2])) # half-practice point
}
# Keep only tests up to (and including) the censoring point
data_censored = data[data$predictor <= censor_after,]

# Quantile regression of score on repetition at 5 quantiles of the distribution
percentiles = c(0.05,0.25,0.5,0.75,0.95)

QR = rq(value ~ predictor, tau=percentiles, data_censored)

# Per quantile: print coefficients with kernel-based SEs ("ker") plus Wald
# 95% CIs, and collect the slope p-value (row 2, col 4) for adjustment below
p_vals = sapply(1:length(summary(QR)), function(i) {
  summ = coef(summary(QR, se="ker")[[i]])
  print(summ)
  print(paste0("Intercept: ", round(summ[1,1],1), " (", round(summ[1,1]-1.96*summ[1,2],1), "-", round(summ[1,1]+1.96*summ[1,2],1), "), beta: ", round(summ[2,1],2), " (", round(summ[2,1]-1.96*summ[2,2],2), "-", round(summ[2,1]+1.96*summ[2,2],2), ")"))
  summ[2,4]
})
##                  Value Std. Error   t value    Pr(>|t|)
## (Intercept) 25.4285714  0.9461731 26.875178 0.00000e+00
## predictor    0.8571429  0.1507008  5.687714 1.54961e-08
## [1] "Intercept: 25.4 (23.6-27.3), beta: 0.86 (0.56-1.15)"
##                  Value Std. Error   t value     Pr(>|t|)
## (Intercept) 35.8571429  0.6202390 57.811815 0.000000e+00
## predictor    0.7142857  0.1026206  6.960452 5.084377e-12
## [1] "Intercept: 35.9 (34.6-37.1), beta: 0.71 (0.51-0.92)"
##             Value Std. Error  t value     Pr(>|t|)
## (Intercept) 42.00  0.6096602 68.89083 0.000000e+00
## predictor    0.75  0.1058070  7.08838 2.095213e-12
## [1] "Intercept: 42.0 (40.8-43.2), beta: 0.75 (0.54-0.96)"
##                  Value Std. Error   t value     Pr(>|t|)
## (Intercept) 47.6666667  0.6508036 73.242786 0.000000e+00
## predictor    0.8333333  0.1131985  7.361696 3.002043e-13
## [1] "Intercept: 47.7 (46.4-48.9), beta: 0.83 (0.61-1.06)"
##             Value Std. Error   t value    Pr(>|t|)
## (Intercept)    56  0.9081981 61.660554 0.00000e+00
## predictor       1  0.1777493  5.625902 2.20416e-08
## [1] "Intercept: 56.0 (54.2-57.8), beta: 1.00 (0.65-1.35)"
# Bonferroni-correct the 5 slope p-values for multiple testing
p_vals = p.adjust(p_vals, method="bonferroni")

# Joint test of equality of slopes across quantiles
ANOVA = anova(QR)

# Descriptive counts for the censored dataset
quant = quantile(table(data_censored$id))
print(paste0("n tests = ", nrow(data_censored), " (median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 1479 (median tests per patient: 11, IQR 11-11)"
# Format a p-value for plot labels: "= <p rounded to `digits` significant
# figures>"; a value that rounds to exactly zero is reported as the
# conventional "< 2e-16".
signif_p = function(x, digits=1) {
  rounded = signif(x, digits)
  if (as.character(rounded) == "0") {
    return("< 2e-16")
  }
  paste0("= ", rounded)
}

# Trajectories up to the censoring point with one fitted line per quantile,
# each labelled with its adjusted slope p-value; overall ANOVA p in the corner
ggplot() + geom_line(aes_string("predictor", "value", group="id"), data_censored, alpha=0.2, color="darkgrey") + theme_pubr(base_family="Serif") + scale_x_continuous(expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + theme(legend.position = "none") + xlab(params$xlab) + ylab(params$unit) +
  geom_abline(intercept=coef(QR)[1,], slope=coef(QR)[2,], color=linecolor) +
  geom_text(data=data.frame(intercept=coef(QR)[1,], label=paste0(percentiles*100, "th percentile: β = ", round(coef(QR)[2,],1), ", ", "p.adj ", sapply(p_vals,signif_p))),
            mapping=aes(x=1,y=intercept, label=label), color=linecolor, hjust="left", vjust=1, family="Serif") +
  coord_cartesian(xlim=c(0,censor_after)) +
  geom_text(aes(x=x,y=y,label=label), data.frame(x=0.8*censor_after, y=0, label=paste0("ANOVA p ", signif_p(ANOVA$table$pvalue, 1))), vjust=-1.5, family="Serif") +
  geom_vline(xintercept=censor_after, color=linecolor, linetype=2)

Dexterity

# Analysis parameters for the Dexterity (Finger Pinching) section.
params = list(
  test_code = "pinching",                     # Floodlight test identifier
  test_metric_code = "successful_pinches",    # metric analysed for this test
  unit = "Pinching: Successful Pinches",      # y-axis label
  unit_n = "hand",                            # unit of observation
  unit_time = "repetition",                   # unit of the time axis
  min_repetitions = 10,                       # inclusion: minimum repetitions
  min_weeks = 10,                             # inclusion: minimum weeks of participation
  predictor = "repetition",                   # column used as model predictor
  xlab = "Repetitions",                       # x-axis label
  # FALSE spelled out (not `F`): `F` is a reassignable binding, not a keyword
  bounded.growth.confidence.interval = FALSE,
  up_to_date = "2021-05-01"                   # analyse data up to (excluding) this date
)
library(data.table) # fread
library(parsedate) # parse_date
library(dplyr) # group_by
library(tibble) # deframe
library(lme4) # lmer
library(mgcv) # gamm
library(quantreg) # rq
library(patchwork) # plot_layout
library(gridExtra) # grid.arrange
library(ggpubr) # ggscatter
library(ggtext) # geom_text
library(sjPlot) # plot_model

# Download from: https://dataset.floodlightopen.com/public-blobs-prod/complete_dataset.csv
data = fread("complete_dataset.csv", data.table=F)

# Prepare dataset
# Keep only the selected test and non-control participants
data = data[data$testCode == params$test_code & !data$participantIsControl,]
data$time = parse_date(data$testStartedAt)
data = data[data$time <= as.POSIXct(params$up_to_date, tz="UTC"),] # only analyse data up to (excluding) params$up_to_date
data = data[!duplicated(data),] # sometimes contains true duplicates for some reason (even with the same testResultMetricId)

# For "Finger Pinching" hand_used has to be determined
if (params$test_code == "pinching") {
  library(tidyr) # pivot_wider
  # just one means either "hand" or "successful_pinches" values are missing, remove those
  table(table(data$time))
  # Drop timestamps with a single metric row, then spread the two metrics
  # (hand, successful_pinches) into columns, one row per test
  data = data[!data$time %in% names(which(table(data$time)==1)), ]
  data = as.data.frame(pivot_wider(data, id_cols=c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time"), names_from="testMetricCode", values_from="testResultMetricValue"))
} else {
  # Other tests: keep the single requested metric; hand is not applicable
  data = data[data$testMetricCode == params$test_metric_code,]
  data$hand_used = NA
  data = data[c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time", "testResultMetricValue", "hand_used")]
}

# Standardize column names across test types
colnames(data) = c("id", "control", "sex", "birthyear", "time", "value", "hand_used")
data$age = year(data$time)-data$birthyear # Estimate age
data = data[order(as.character(data$id)),]

# 0 result values are discarded
data = data[!is.na(data$value) & data$value != 0,]

# Consider those supposedly younger than 18 (minimum study age) and older than 90 as NA
data$age[data$age < 18 | data$age > 90] = NA

# For pinching, the unit of analysis is the hand: encode it in the id so all
# downstream per-id grouping operates per hand; keep the patient id separately
data$id_original = data$id
data$id = paste0(data$id, "_hand", data$hand_used)

# Calendar day of each test (integer date)
data$day = as.IDate(data$time)
# Deliberately shadows base::round so that every subsequent round() call in
# this script returns a fixed-decimal *string* (e.g. "28.2") for report text;
# call base::round() explicitly wherever a numeric result is needed.
round = function(x, digits=0) {
  fmt = paste0("%.", digits, "f")
  sprintf(fmt, x)
}

# Reusable ggplot theme fragments to hide an axis' text and ticks
no_x = theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
no_y = theme(axis.text.y=element_blank(), axis.ticks.y=element_blank())

# Accent color used for fitted curves and annotations throughout
linecolor = "#c71138"

Participant selection

# At least x weeks & repetitions
# Per id (hand), derive a 0-based repetition index and the weeks elapsed since
# that id's first test. Vectorized with ave() instead of the previous per-id
# loop, which rescanned the full id column once per id (O(n * n_ids)); ave()
# preserves row order within each group, so the results are identical.
data$repetition = ave(seq_len(nrow(data)), data$id, FUN = seq_along) - 1
# POSIXct is seconds since epoch; 604800 s per week matches difftime(units="weeks")
data$weeksSinceFirst = ave(as.numeric(data$time), data$id,
                           FUN = function(t) (t - t[1]) / 604800)

# Pre-filter cohort sizes, kept for the inclusion-rate table below
n_orig = nrow(data)
n_patients_orig = length(unique(data$id_original))
n_hands_orig = length(unique(data$id))

# Last row per id carries that id's total weeks and final repetition index
# (relies on rows being in repetition order within each id)
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(weeksSinceFirst), repetitions=last(repetition), .groups="keep")

Among the total n=1816 patients with n=20695 repetitions, the median length of participation is 0.6 weeks (IQR 0.0-8.1, range 0.0-151.7) and the median number of repetitions is 2 (IQR 1-7, range 1-370).

# Inclusion criteria: keep ids with at least min_weeks of participation and at
# least min_repetitions tests (repetitions is 0-based, hence the +1)
data = data[data$id %in% participation_duration$id[participation_duration$weeks >= params$min_weeks & participation_duration$repetitions+1 >= params$min_repetitions],]

for (id in unique(data$id)) {
  subset = data$id == id
  n = sum(subset)
  # Days since the previous test of the same id (0 for the first test):
  # pair each time with the lagged vector c(t[1], t[1], t[2], ..., t[n-1]).
  # seq_len(n - 1) instead of 1:(n - 1): for a single-row id, 1:(n - 1)
  # expands to c(1, 0) and misaligns the lagged vector; seq_len(0) is empty.
  # (Identical to the previous code for n >= 2, which inclusion guarantees.)
  prev = c(data[subset, "time"][1], data[subset, "time"][seq_len(n - 1)])
  data[subset, "daysSinceLast"] = as.numeric(difftime(data[subset, "time"], prev, unit="days"))
}

# Per-id cohort descriptors after inclusion: demographics, participation
# length, and the median/IQR of the interval between consecutive tests
participation_duration = data %>% group_by(id) %>% summarise(sex=first(sex), mean_age=mean(age), weeks=last(weeksSinceFirst), repetitions=last(repetition), median_intertest_interval=median(daysSinceLast), IQR_intertest_interval=IQR(daysSinceLast), .groups="keep")

# Expose the configured predictor column under a fixed name for the models
data$predictor = data[,params$predictor]

Inclusion criteria: participation for at least 10 weeks and at least 10 repetitions of the test performed, leading to the analysis of n=150 / 1059 patients, 289 / 1816 hands and n=16107 / 20695 tests. Among those, the median length of participation is 28.2 weeks (IQR 15.3-57.9, range 10.0-151.7) and the median number of repetitions is 35 (IQR 18-59, range 10-370).

# One-column cohort table: inclusion rates and descriptive statistics
# (round() is the sprintf-based shadow, so entries are formatted strings;
# repetitions is 0-based, hence the +1 throughout)
t(data.frame(
  n_patients = paste0(length(unique(data$id_original)), " / ", n_patients_orig, " (", round(length(unique(data$id_original))/n_patients_orig*100,1), "%)"),
  n_hands = paste0(length(unique(data$id)), " / ", n_hands_orig, " (", round(length(unique(data$id))/n_hands_orig*100,1), "%)"),
  n_tests = paste0(nrow(data), " / ", n_orig, " (", round(nrow(data)/n_orig*100,1), "%)"),
  percent_female = paste0(round(prop.table(table(participation_duration$sex == "female"))[[2]]*100, 1)),
  age = paste0(round(median(participation_duration$mean_age,na.rm=T),1), " (", round(quantile(participation_duration$mean_age, 0.25, na.rm=T),1), "-", round(quantile(participation_duration$mean_age, 0.75, na.rm=T),1), ", range ", round(min(participation_duration$mean_age, na.rm=T),1), "-", round(max(participation_duration$mean_age, na.rm=T),1), ")"),
  repetitions = paste0(median(participation_duration$repetitions)+1, " repetitions (IQR ", quantile(participation_duration$repetitions+1, 0.25), "-", quantile(participation_duration$repetitions+1, 0.75), ", range ", min(participation_duration$repetitions+1), "-", max(participation_duration$repetitions+1), ")"),
  median_intertest_interval = paste0(round(median(participation_duration$median_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$median_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$median_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$median_intertest_interval),1), "-", round(max(participation_duration$median_intertest_interval),1), ")"),
  IQR_intertest_interval = paste0(round(median(participation_duration$IQR_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$IQR_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$IQR_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$IQR_intertest_interval),1), "-", round(max(participation_duration$IQR_intertest_interval),1), ")"),
  weeks = paste0(round(median(participation_duration$weeks),1), " weeks (IQR ", round(quantile(participation_duration$weeks, 0.25),1), "-", round(quantile(participation_duration$weeks, 0.75),1), ", range ", round(min(participation_duration$weeks),1), "-", round(max(participation_duration$weeks),1), ")")
))
##                           [,1]                                          
## n_patients                "150 / 1059 (14.2%)"                          
## n_hands                   "289 / 1816 (15.9%)"                          
## n_tests                   "16107 / 20695 (77.8%)"                       
## percent_female            "70.2"                                        
## age                       "51.1 (43.4-58.1, range 20.0-74.4)"           
## repetitions               "35 repetitions (IQR 18-59, range 10-370)"    
## median_intertest_interval "2.9 days (IQR 2.1-4.2, range 1.9-37.0)"      
## IQR_intertest_interval    "2.2 days (IQR 0.7-5.6, range 0.0-61.1)"      
## weeks                     "28.2 weeks (IQR 15.3-57.9, range 10.0-151.7)"

Summary level analysis

Difference test

# One row per id: first/last/mean score, participation length, and the
# first-to-last difference. Note repetition=n() here is the test COUNT
# (elsewhere `repetition` is the 0-based index). Relies on within-id row order.
df = as.data.frame(data %>% group_by(id) %>% summarise(first=first(value), last=last(value), mean=mean(value), weeksSinceFirst=max(weeksSinceFirst), repetition=n(), first_age=first(age), last_age=last(age), mean_age=mean(age), .groups="keep") %>% mutate(diff=last-first))

df$predictor = df[, params$predictor]

# Paired t-test of last vs first score per id
test = t.test(df$last, df$first, paired=T)
test
## 
##  Paired t-test
## 
## data:  df$last and df$first
## t = 20.174, df = 288, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  16.32819 19.85866
## sample estimates:
## mean of the differences 
##                18.09343
# Baseline model: improvement explained by age and first score only
# (covariates scaled by 10 so coefficients are per 10 years / 10 points)
mod0 = lm(diff ~ I(mean_age/10) + I(first/10), df)
summ0 = summary(mod0)
summ0
## 
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10), data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.974  -9.201   0.863  10.347  28.446 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     43.1418     4.8569   8.883  < 2e-16 ***
## I(mean_age/10)  -2.1949     0.8066  -2.721  0.00691 ** 
## I(first/10)     -5.4520     0.6530  -8.349 3.07e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.73 on 284 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1978, Adjusted R-squared:  0.1922 
## F-statistic: 35.02 on 2 and 284 DF,  p-value: 2.542e-14
# Full model: additionally adjusts for (log10) number of repetitions
mod = lm(diff ~ I(mean_age/10) + I(first/10) + log10(predictor), df)
confint(mod)
##                      2.5 %    97.5 %
## (Intercept)      17.832448 37.100494
## I(mean_age/10)   -5.082393 -2.093342
## I(first/10)      -6.988442 -4.632970
## log10(predictor) 11.206478 19.068981
summ = summary(mod)
summ
## 
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10) + log10(predictor), 
##     data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.940  -7.504   0.661   8.968  24.909 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       27.4665     4.8944   5.612 4.77e-08 ***
## I(mean_age/10)    -3.5879     0.7593  -4.725 3.62e-06 ***
## I(first/10)       -5.8107     0.5983  -9.712  < 2e-16 ***
## log10(predictor)  15.1377     1.9972   7.579 4.98e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.54 on 283 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.3332, Adjusted R-squared:  0.3261 
## F-statistic: 47.14 on 3 and 283 DF,  p-value: < 2.2e-16
# additional variance explained by predictor
#print(summ$r.squared - summ0$r.squared)

# Observed improvement relative to mean baseline score, with the paired
# t-test's 95% CI expressed on the same percentage scale
print(paste0("Average observed improvement over baseline: ", round(test$estimate/mean(df$first)*100, 1), " (", round(test$conf[1]/mean(df$first)*100, 1), "-", round(test$conf[2]/mean(df$first)*100, 1), ")"))
## [1] "Average observed improvement over baseline: 70.5 (63.6-77.4)"
# y position for the (currently commented-out) significance bracket
lab.y = 1.1*mean(df$last)

# Bar plot of first/mean/last scores (mean ± SE)
p1 = ggbarplot(data.frame(Timepoint=rep(c("First","Mean","Last"),each=nrow(df)), value=c(df$first,df$mean,df$last)), "Timepoint", "value", add="mean_se", label=T, lab.nb.digits=1, lab.vjust=1.9, ylab=params$unit) + xlab("Score") #+ stat_compare_means(comparisons = list(c("First","Last")), paired=T, method="t.test", label.y=lab.y) + scale_y_continuous(expand=expansion(mult=c(0,0.1)))

# Forest plot of the adjusted model's coefficients
# NOTE(review): plot_model() is given the summary object `summ`, not the
# fitted model `mod`; sjPlot normally expects the model — verify this renders
# as intended (the earlier section presumably behaves the same way).
p2 = plot_model(summ, show.values=T, vline.color = "grey", show.intercept=T, colors=linecolor, title=paste0("Difference from First to Last Score, R²=", round(summ$r.squared, 2)), axis.labels=rev(c("Intercept", "Age (per 10 years)", "First score (per 10)", paste0(params$xlab, " (log 10)"))), value.offset=0.3, show.p=F) + ylab("β estimates")

(p1 + p2) + plot_layout(widths=c(2,5)) & theme_pubr(base_family="Serif")

Confounders

# Pairwise confounder checks: scatter plots with regression line, CI band and
# correlation coefficient for every pair of {age, first, last, predictor, diff}
# (predictor axes on log10 scale), plus marginal histograms for the diagonal.
p_age_first = ggscatter(df, "mean_age", "first", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("First score") + theme_pubr(base_family="Serif")

p_age_pred = ggscatter(df, "mean_age", "predictor", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("Mean age") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")

p_age_last = ggscatter(df, "mean_age", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("Last score") + theme_pubr(base_family="Serif")

p_age_diff = ggscatter(df, "mean_age", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("Difference first to last score") + theme_pubr(base_family="Serif")

p_first_pred = ggscatter(df, "first", "predictor",  add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("First score") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")

p_first_last = ggscatter(df, "first", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("First score") + ylab("Last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_first_diff = ggscatter(df, "first", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("First score") + ylab("Difference first to last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_pred_last = ggscatter(df, "predictor", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_x_log10(expand = expansion(mult = c(.05, .15))) + xlab(paste0(params$xlab, " (log10)")) + ylab("Last score") + theme_pubr(base_family="Serif")

p_pred_diff = ggscatter(df, "predictor", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_x_log10(expand = expansion(mult = c(.05, .15))) + xlab(paste0(params$xlab, " (log10)")) + ylab("Difference first to last score") + theme_pubr(base_family="Serif")

p_last_diff = ggscatter(df, "last", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Last score") + ylab("Difference first to last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_last_pred = ggscatter(df, "last", "predictor",  add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("Last score") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")


# Marginal distributions for the diagonal of the matrix layout below
p_age = gghistogram(df, "mean_age", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_first = gghistogram(df, "first", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_last = gghistogram(df, "last", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_pred = gghistogram(df, "predictor", bins=15) + scale_x_log10() + xlab(NULL) + theme_pubr(base_family="Serif")
p_diff = gghistogram(df, "diff", bins=15) + xlab("Difference first to last") + theme_pubr(base_family="Serif")

#(((p1+xlab(NULL)) + (p2+xlab(NULL)+ylab(NULL))) / ((p3+xlab(NULL)) + (p4+xlab(NULL)+ylab(NULL))) / ((p5) | (p6+ylab(NULL)))) & theme_pubr(base_family="Serif")

#(p_age_first | p_first) / (p_age_last | p_first_last | p_last) / (p_age_pred | p_first_pred | p_last_pred | p_pred_diff)

# Lower-triangular 5x5 layout (diagonal = histograms, off-diagonal = scatters);
# the 15 plots are listed column by column to match lower.tri() numbering
m <- matrix(NA, 5, 5)
m[lower.tri(m, diag = T)] <- 1:15
grid.arrange(grobs=list(
  p_age, p_age_first+xlab(NULL), p_age_pred+xlab(NULL), p_age_last+xlab(NULL), p_age_diff,
  p_first, p_first_pred+xlab(NULL)+ylab(""), p_first_last+xlab(NULL)+ylab(""), p_first_diff+ylab(""),
  p_pred, p_pred_last+xlab(NULL)+ylab(""), p_pred_diff+ylab(""),
  p_last, p_last_diff+ylab(""), 
  p_diff
), layout_matrix=m, heights=c(1,1,1,1,1.1))
## Warning: Removed 2 rows containing non-finite values (stat_bin).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing non-finite values (stat_cor).
## Warning: Removed 2 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing non-finite values (stat_cor).
## Warning: Removed 2 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing non-finite values (stat_cor).
## Warning: Removed 2 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing non-finite values (stat_cor).
## Warning: Removed 2 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Alternative all-in-one correlation matrix plots, kept for reference
#GGally::ggpairs(df[c("mean_age", "first", "last", "predictor", "diff")])
#pairs(df[c("mean_age", "first", "last", "predictor", "diff")], upper.panel=NULL)
#corrplot::corrplot(cor(df[c("mean_age", "first", "last", "predictor", "diff")], use="complete.obs"))

Learning curve: Model selection

# Candidate model 1: penalized-spline GAMM with a random intercept per id
smoothing_spline = gamm(value ~ s(predictor, bs="ps"), random=list(id=~1), data=data)
summary(smoothing_spline$lme)
## Linear mixed-effects model fit by maximum likelihood
##  Data: strip.offset(mf) 
##        AIC      BIC    logLik
##   112739.3 112777.8 -56364.66
## 
## Random effects:
##  Formula: ~Xr - 1 | g
##  Structure: pdIdnot
##              Xr1      Xr2      Xr3      Xr4      Xr5      Xr6      Xr7      Xr8
## StdDev: 7.454274 7.454274 7.454274 7.454274 7.454274 7.454274 7.454274 7.454274
## 
##  Formula: ~1 | id %in% g
##         (Intercept) Residual
## StdDev:    12.39119 7.674355
## 
## Fixed effects: y ~ X - 1 
##                     Value Std.Error    DF  t-value p-value
## X(Intercept)     43.45128  0.736471 15817 58.99930       0
## Xs(predictor)Fx1 43.95750  9.436646 15817  4.65817       0
##  Correlation: 
##                  X(Int)
## Xs(predictor)Fx1 0.004 
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -6.25281763 -0.49720339  0.07639008  0.60348222  7.06847413 
## 
## Number of Observations: 16107
## Number of Groups: 
##         g id %in% g 
##         1       289
summary(smoothing_spline$gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## value ~ s(predictor, bs = "ps")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  43.4513     0.7364      59   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df     F p-value    
## s(predictor) 8.313  8.313 597.8  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.186   
##   Scale est. = 58.896    n = 16107
# Candidate model 2: linear mixed model (random intercept per id)
REM_linear = lmer(value ~ (1|id) + predictor, data)
# Population-level prediction at time t from the fixed effects
equ_linear = function(t) fixef(REM_linear)[1] + fixef(REM_linear)[2]*t
summary(REM_linear)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor
##    Data: data
## 
## REML criterion at convergence: 115031.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.1979 -0.4926  0.1022  0.6188  5.6972 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 158.62   12.595  
##  Residual              68.26    8.262  
## Number of obs: 16107, groups:  id, 289
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept) 37.754957   0.747868   50.48
## predictor    0.065839   0.001433   45.95
## 
## Correlation of Fixed Effects:
##           (Intr)
## predictor -0.053
# Candidate model 3: quadratic mixed model (random intercept per id)
REM_quadratic = lmer(value ~ (1|id) + predictor + I(predictor^2), data)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
# Population-level prediction at time t from the fixed effects
equ_quadratic = function(t) fixef(REM_quadratic)[1] + fixef(REM_quadratic)[2]*t + fixef(REM_quadratic)[3]*t^2
summary(REM_quadratic)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor + I(predictor^2)
##    Data: data
## 
## REML criterion at convergence: 114228.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.1921 -0.5035  0.0821  0.6175  6.2998 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 154.86   12.444  
##  Residual              64.83    8.052  
## Number of obs: 16107, groups:  id, 289
## 
## Fixed effects:
##                  Estimate Std. Error t value
## (Intercept)     3.612e+01  7.409e-01   48.76
## predictor       1.549e-01  3.366e-03   46.02
## I(predictor^2) -3.592e-04  1.235e-05  -29.09
## 
## Correlation of Fixed Effects:
##             (Intr) prdctr
## predictor   -0.090       
## I(prdctr^2)  0.076 -0.910
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
# Candidate model 4: nonlinear mixed-effects bounded-growth model using the
# self-starting asymptotic curve SSasymp (yf = asymptote, y0 = value at 0,
# log_alpha = log rate), with random y0 and yf per id
REM_bounded = nlmer(value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0|id) + (yf|id), data = data, start=c(yf=40, y0=20, log_alpha=-1))
# Fixed effects reused as globals by the curve helpers below
y0=fixef(REM_bounded)["y0"]
yf=fixef(REM_bounded)["yf"]
log_alpha=fixef(REM_bounded)["log_alpha"]
# Curve value at time t; defaults to the fitted fixed effects, but the CI
# ribbon passes the profile-CI bounds instead
equ_bounded = function(t, yf=fixef(REM_bounded)[["yf"]], y0=fixef(REM_bounded)[["y0"]], log_alpha=fixef(REM_bounded)[["log_alpha"]]) yf+(y0-yf)*exp(-exp(log_alpha)*t)
summary(REM_bounded)
## Warning in vcov.merMod(object, use.hessian = use.hessian): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
## Warning in vcov.merMod(object, correlation = correlation, sigm = sig): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
## Nonlinear mixed model fit by maximum likelihood  ['nlmerMod']
## Formula: value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0 | id) + (yf |  
##     id)
##    Data: data
## 
##      AIC      BIC   logLik deviance df.resid 
## 110214.9 110261.1 -55101.5 110202.9    16101 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.9476 -0.4878  0.0788  0.6072  5.4809 
## 
## Random effects:
##  Groups   Name Variance Std.Dev.
##  id       y0   174.15   13.197  
##  id.1     yf   524.55   22.903  
##  Residual       48.07    6.933  
## Number of obs: 16107, groups:  id, 289
## 
## Fixed effects:
##           Estimate Std. Error t value
## yf        55.88899    1.52873   36.56
## y0        31.75416    0.79357   40.01
## log_alpha -3.69809    0.03111 -118.89
## 
## Correlation of Fixed Effects:
##           yf     y0    
## y0        -0.026       
## log_alpha -0.291 -0.071
# Growth rate alpha on the natural scale
exp(log_alpha)
## log_alpha 
## 0.0247707
# Relative gain from baseline y0 to the asymptote yf, in percent
cat("Average improvement over baseline: ", (yf-y0)/y0*100)
## Average improvement over baseline:  76.00525
# Compare the four candidate models on in-sample fit and complexity
RMSE = sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) sqrt(mean(resid(mod)^2))) # RMSE
RMSE
## [1] 8.188627 7.979710 7.604297 6.819560
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) mean(abs(resid(mod)))) # MAE
## [1] 6.081292 5.913759 5.598031 5.041147
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) extractAIC(mod)) # edf & AIC: smoothing_spline$lme always has edf 5
##          [,1]     [,2]     [,3]     [,4]
## [1,]      4.0      5.0      5.0      6.0
## [2,] 115029.8 114207.2 112739.3 110214.9
edf = sapply(list(REM_linear, REM_quadratic, smoothing_spline$gam, REM_bounded), function(mod) { nrow(data)-df.residual(mod) }) # while smoothing_spline$gam often has much higher edf
edf
## [1] 4.00000 5.00000 9.31268 6.00000
# Formatted for the plot annotation (round() shadow returns strings)
RMSE = round(RMSE,1)
edf = round(edf,1)

Plot

# Per-id last predictor value (despite the name, in units of the predictor)
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(predictor), .groups="keep") %>% deframe()
# How many units (hands) are still contributing data at each predictor bin
remaining_participants_bins = seq(0,350,by=10)
remaining_participants = data.frame(x=remaining_participants_bins, text=sapply(remaining_participants_bins, function(x) sum(participation_duration>=x)))

# Truncate the x axis where fewer than 10 units remain
xmax = remaining_participants$x[which(remaining_participants$text<10)[1]]

# Clip the y axis to the central 90% of observed values
range_90p =quantile(data$value, probs=c(0.05,0.95))

# All four fitted curves over the raw trajectories, with an RMSE (edf) legend
p1 = ggplot() +
  geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.1) +
  geom_line(aes(x,y), data.frame(x=0:xmax, y=predict(smoothing_spline$gam, newdata=data.frame(predictor=0:xmax))), linetype="longdash", size=1) + xlim(0,xmax) + ylim(range_90p[1], range_90p[2]) +
  stat_function(fun=equ_linear, color="blue", linetype="dotted", size=1) + 
  stat_function(fun=equ_quadratic, color="green4", linetype="dashed", size=1) + 
  stat_function(fun=equ_bounded, color=linecolor, size=1) + 
  theme_pubr(base_family="Serif") + no_x + xlab(NULL) + ylab(params$unit) +
  geom_richtext(aes(x,y,label=label,hjust=1), data.frame(x=0.8*xmax, y=range_90p[2]-(range_90p[2]-range_90p[1])/1.3, label=paste0("Model RMSE (edf):<br><span style='color:#0000ff'>····· Linear: ", RMSE[1], " (", edf[1], ") </span><br><span style='color:#008b00'>- - - Quadratic: ", RMSE[2], " (", edf[2], ") </span><br><span style='color:#000000'>— — Smoothing spline: ", RMSE[3], " (", edf[3], ") </span><br><span style='color:", linecolor, "'>— Bounded growth: ", RMSE[4], " (", edf[4], ") </span>")), family="Serif")

# Strip under the main panel showing remaining units per bin
p2 = ggplot(remaining_participants[remaining_participants$x<=xmax,]) +
  geom_text(aes(x=x,y="A",label=text), family="Serif") +
  theme_pubr(base_family="Serif") + no_y + xlab(params$xlab) + ylab(paste0("Remaining \n ", params$unit_n, "s")) +
  scale_x_continuous(breaks=seq(0,xmax,by=10))

(p1 / p2) + plot_layout(heights=c(0.9,0.1))
## Warning: Removed 900 row(s) containing missing values (geom_path).

# Profile-likelihood 95% CIs for the bounded-growth fixed effects; skipped by
# default for this section (slow) via params$bounded.growth.confidence.interval
if (params$bounded.growth.confidence.interval) conf = confint.merMod(REM_bounded, c("y0","yf","log_alpha"), method="profile")
if (params$bounded.growth.confidence.interval) {
  print(conf)
  print(exp(conf[3,])) # alpha CI on the natural scale
  print(paste0("Average boundary improvement over baseline: ", round((yf-y0)/y0*100, 1), " (", round((conf[1,1]-conf[2,1])/conf[2,1]*100, 1), "-", round((conf[1,2]-conf[2,2])/conf[2,2]*100, 1), ")"))
}

Bounded growth model

# Helpers for the fitted bounded-growth curve
#   y(t) = yf + (y0 - yf) * exp(-exp(log_alpha) * t)
# where yf, y0 and log_alpha are the fixed effects of REM_bounded,
# read from the global environment at call time.

# Slope (first derivative) of the bounded-growth curve at time t.
equ_diff_REM_bounded = function(t) {
  alpha = exp(log_alpha)
  alpha*(yf-y0)*exp(alpha*-t)
}

# Inverse of the derivative: time at which the curve reaches a given slope.
equ_diff_get_time_REM_bounded = function(target_slope) {
  alpha = exp(log_alpha)
  log(alpha*(yf-y0)/target_slope)/alpha
}

# Inverse of the curve itself: time at which a given value is reached.
equ_bounded_get_x = function(target_value) {
  exp(-log_alpha)*log((yf-y0)/(yf-target_value))
}

# Annotate the bounded-growth curve at three points of the practice effect:
# baseline (0%), the half-practice point (50%) and 90% of the asymptotic gain.
growth_percentiles = c(0, 0.5, 0.9)
names_percentiles = c("baseline", "half-practice point", "90% practice")
# Timepoints at which the curve reaches those fractions of the total gain yf-y0
selected_timepoints = equ_bounded_get_x(y0+(yf-y0)*growth_percentiles)
# One labelled point per timepoint: curve value y and local slope m
# (round() here returns formatted strings via the sprintf-based shadow)
example_slopes_bounded = data.frame(
  x=selected_timepoints,
  y=equ_bounded(selected_timepoints),
  label=paste0("y=", round(equ_bounded(selected_timepoints),1), ", m=", signif(equ_diff_REM_bounded(selected_timepoints),2), " at ", params$unit_time, " ", round(selected_timepoints,0), ", ", names_percentiles),
  vjust=1.5
)
# Extra row marking the asymptote (boundary) yf, placed near the right edge
example_slopes_bounded = rbind(example_slopes_bounded, list(x=0.83*xmax, y=yf, label=paste0("boundary: ", round(yf, 1)), vjust=-1.0))

# Confidence ribbon from the profile CIs of yf/y0/log_alpha (only when enabled)
if (params$bounded.growth.confidence.interval) ribbon = data.frame(x=seq(0,xmax,0.05), ymin=equ_bounded(seq(0,xmax,0.05), conf["yf","2.5 %"], conf["y0","2.5 %"], conf["log_alpha","2.5 %"]), ymax=equ_bounded(seq(0,xmax,0.05), conf["yf","97.5 %"], conf["y0","97.5 %"], conf["log_alpha","97.5 %"]))

# Descriptive counts: tests per unit (hand) as median and IQR
quant = quantile(table(data$id))
print(paste0("n tests = ", nrow(data), " (n ", params$unit_n, "s = ", length(unique(data$id)), ", median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 16107 (n hands = 289, median tests per hand: 35, IQR 18-59)"
# Individual trajectories (grey) with the fitted bounded-growth curve,
# a vertical line at the half-practice point, and the labelled example slopes
p1 = ggplot() + geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.2) +
  theme_pubr(base_family="Serif") + scale_x_continuous(limits = c(0,xmax), expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + xlab(params$xlab) + ylab(params$unit) +
  geom_vline(xintercept=example_slopes_bounded[2,"x"], color=linecolor, linetype=2) +
  stat_function(fun=equ_bounded, color=linecolor, size=1) +
  geom_point(data=example_slopes_bounded[1:(nrow(example_slopes_bounded)-1),], aes(x,y), color=linecolor, size=5) +
  geom_text(data=example_slopes_bounded, aes(x,y,label=label, vjust=vjust), color=linecolor, hjust=-0.01, family="Serif")

# Overlay the CI ribbon only when it was computed above
if (params$bounded.growth.confidence.interval) p1 = p1 + geom_ribbon(aes(x=x, ymin=ymin, ymax=ymax), ribbon, fill=linecolor, alpha=0.3)

p1
## Warning: Removed 480 row(s) containing missing values (geom_path).

Quantile regression

# Censor follow-up at the configured cut-off, defaulting to the model-derived
# half-practice point when params$censor_after is not set
if (is.null(params$censor_after)) {
  censor_after = as.integer(round(selected_timepoints[2])) # half-practice point
} else {
  censor_after = params$censor_after
}
data_censored = data[data$predictor <= censor_after,]

# Percentiles modelled by the quantile regression
percentiles = c(0.05,0.25,0.5,0.75,0.95)

QR = rq(value ~ predictor, tau=percentiles, data_censored)
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
# For each fitted quantile: print the coefficient table (kernel SEs) and a Wald 95% CI,
# then return the slope's p-value (row 2, column 4 of the coefficient matrix)
p_vals = sapply(1:length(summary(QR)), function(i) {
  summ = coef(summary(QR, se="ker")[[i]])
  print(summ)
  print(paste0("Intercept: ", round(summ[1,1],1), " (", round(summ[1,1]-1.96*summ[1,2],1), "-", round(summ[1,1]+1.96*summ[1,2],1), "), beta: ", round(summ[2,1],2), " (", round(summ[2,1]-1.96*summ[2,2],2), "-", round(summ[2,1]+1.96*summ[2,2],2), ")"))
  summ[2,4]
})
##                 Value Std. Error   t value     Pr(>|t|)
## (Intercept) 6.6666667 0.58572721 11.381863 0.000000e+00
## predictor   0.3333333 0.04085931  8.158075 4.440892e-16
## [1] "Intercept: 6.7 (5.5-7.8), beta: 0.33 (0.25-0.41)"
##                  Value Std. Error  t value Pr(>|t|)
## (Intercept) 20.9230769 0.44392951 47.13153        0
## predictor    0.5384615 0.03203369 16.80923        0
## [1] "Intercept: 20.9 (20.1-21.8), beta: 0.54 (0.48-0.60)"
##                  Value Std. Error  t value Pr(>|t|)
## (Intercept) 32.0000000 0.46686831 68.54181        0
## predictor    0.5789474 0.02892239 20.01727        0
## [1] "Intercept: 32.0 (31.1-32.9), beta: 0.58 (0.52-0.64)"
##                  Value Std. Error   t value Pr(>|t|)
## (Intercept) 43.5789474 0.42595392 102.30907        0
## predictor    0.4210526 0.02638668  15.95701        0
## [1] "Intercept: 43.6 (42.7-44.4), beta: 0.42 (0.37-0.47)"
##                  Value Std. Error    t value Pr(>|t|)
## (Intercept) 55.0909091 0.51897633 106.153028        0
## predictor    0.3181818 0.03430807   9.274255        0
## [1] "Intercept: 55.1 (54.1-56.1), beta: 0.32 (0.25-0.39)"
# Bonferroni-adjust the five slope p-values
p_vals = p.adjust(p_vals, method="bonferroni")

# Joint test of slope equality across the quantiles
ANOVA = anova(QR)

quant = quantile(table(data_censored$id))
print(paste0("n tests = ", nrow(data_censored), " (median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 6945 (median tests per hand: 29, IQR 18-29)"
# Format a p-value for plot labels: "= <rounded>" normally, "< 2e-16" when the
# value rounds all the way down to zero.
signif_p = function(x, digits=1) {
  rounded = signif(x, digits)
  if (as.character(rounded) == "0") {
    "< 2e-16"
  } else {
    paste0("= ", rounded)
  }
}

# Censored trajectories (grey) overlaid with the fitted quantile-regression lines,
# per-percentile labels with adjusted p-values, and the censoring cut-off (dashed)
ggplot() + geom_line(aes_string("predictor", "value", group="id"), data_censored, alpha=0.2, color="darkgrey") + theme_pubr(base_family="Serif") + scale_x_continuous(expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + theme(legend.position = "none") + xlab(params$xlab) + ylab(params$unit) +
  geom_abline(intercept=coef(QR)[1,], slope=coef(QR)[2,], color=linecolor) +
  geom_text(data=data.frame(intercept=coef(QR)[1,], label=paste0(percentiles*100, "th percentile: β = ", round(coef(QR)[2,],1), ", ", "p.adj ", sapply(p_vals,signif_p))),
            mapping=aes(x=1,y=intercept, label=label), color=linecolor, hjust="left", vjust=1, family="Serif") +
  coord_cartesian(xlim=c(0,censor_after)) +
  geom_text(aes(x=x,y=y,label=label), data.frame(x=0.8*censor_after, y=0, label=paste0("ANOVA p ", signif_p(ANOVA$table$pvalue, 1))), vjust=-1.5, family="Serif") +
  geom_vline(xintercept=censor_after, color=linecolor, linetype=2)

Mobility

# Analysis parameters for the mobility test (Two Minute Walk, step count)
params = list(
  test_code = "two_min_walk",
  test_metric_code = "steps",
  unit = "Two Minute Walk: Steps",
  unit_n = "patient",
  unit_time = "repetition",
  min_repetitions = 10,
  min_weeks = 10,
  predictor = "repetition",
  xlab = "Repetitions",
  bounded.growth.confidence.interval = F, # skip the slow profile CIs for this test
  censor_after = 10, # allow comparison with cognition
  up_to_date = "2021-05-01"
)
library(data.table) # fread
library(parsedate) # parse_date
library(dplyr) # group_by
library(tibble) # deframe
library(lme4) # lmer
library(mgcv) # gamm
library(quantreg) # rq
library(patchwork) # plot_layout
library(gridExtra) # grid.arrange
library(ggpubr) # ggscatter
library(ggtext) # geom_text
library(sjPlot) # plot_model

# Download from: https://dataset.floodlightopen.com/public-blobs-prod/complete_dataset.csv
data = fread("complete_dataset.csv", data.table=F) # read as a plain data.frame

# Prepare dataset: keep only the selected test's records from non-control participants
data = data[data$testCode == params$test_code & !data$participantIsControl,]
data$time = parse_date(data$testStartedAt)
data = data[data$time <= as.POSIXct(params$up_to_date, tz="UTC"),] # only analyse data up to (excluding) params$up_to_date
data = data[!duplicated(data),] # sometimes contains true duplicates for some reason (even with the same testResultMetricId)

# For "Finger Pinching" hand_used has to be determined
if (params$test_code == "pinching") {
  library(tidyr) # pivot_wider
  # just one means either "hand" or "successful_pinches" values are missing, remove those
  table(table(data$time))
  data = data[!data$time %in% names(which(table(data$time)==1)), ]
  # reshape so each test occupies one row with its metrics as columns
  data = as.data.frame(pivot_wider(data, id_cols=c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time"), names_from="testMetricCode", values_from="testResultMetricValue"))
} else {
  # other tests report one metric per row; keep only the configured metric
  data = data[data$testMetricCode == params$test_metric_code,]
  data$hand_used = NA # hand is only meaningful for the pinching test
  data = data[c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time", "testResultMetricValue", "hand_used")]
}

colnames(data) = c("id", "control", "sex", "birthyear", "time", "value", "hand_used")
data$age = year(data$time)-data$birthyear # Estimate age
data = data[order(as.character(data$id)),]

# 0 result values are discarded
data = data[!is.na(data$value) & data$value != 0,]

# Consider those supposedly younger than 18 (minimum study age) and older than 90 as NA
data$age[data$age < 18 | data$age > 90] = NA

# Analysis unit "id": participant, or participant-hand for the pinching test
data$id_original = data$id
data$id = paste0(data$id, "_hand", data$hand_used)

data$day = as.IDate(data$time)
# NOTE: shadows base::round for the rest of the script — returns a fixed-decimal
# *character* string (via sprintf), not a numeric value
round = function(x, digits=0) sprintf(paste0("%.", digits, "f"), x)

# ggplot helpers: themes that drop axis text/ticks, and the accent colour
no_x = theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
no_y = theme(axis.text.y=element_blank(), axis.ticks.y=element_blank())

linecolor = "#c71138"

Participant selection

# Per participant: assign a 0-based repetition index and the number of weeks elapsed
# since that participant's first test (rows are already in chronological order per id)
for (participant in unique(data$id)) {
  rows = data$id == participant
  n_tests = sum(rows)
  data[rows, "repetition"] = seq_len(n_tests) - 1
  data[rows, "weeksSinceFirst"] = as.numeric(difftime(data[rows, "time"], data[rows, "time"][1], unit="weeks"))
}

# Cohort size before applying the inclusion criteria
n_orig = nrow(data)
n_patients_orig = length(unique(data$id_original))
n_hands_orig = length(unique(data$id))

# Per-participant totals: last() picks the maxima since rows are in test order
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(weeksSinceFirst), repetitions=last(repetition), .groups="keep")

Among the total n=540 patients with n=14031 repetitions, the median length of participation is 0.9 weeks (IQR 0.0-11.2, range 0.0-133.5) and the median number of repetitions is 3 (IQR 1-14.25, range 1-735).

# Inclusion criteria: at least min_weeks of participation and min_repetitions tests
# (repetition is 0-based, hence the +1)
data = data[data$id %in% participation_duration$id[participation_duration$weeks >= params$min_weeks & participation_duration$repetitions+1 >= params$min_repetitions],]

# Days elapsed since each participant's previous test; the first test gets 0
# (its time is compared with itself).
for (id in unique(data$id)) {
  subset = data$id == id
  n = sum(subset)
  # seq_len(n-1) instead of 1:(n-1): for n == 1 the latter evaluates to c(1, 0),
  # yielding a length-2 lag vector and a broken assignment; seq_len(0) is empty,
  # so a participant with a single test correctly gets daysSinceLast == 0.
  # (Identical to 1:(n-1) for the n >= 2 rows present after the inclusion filter.)
  data[subset, "daysSinceLast"] = as.numeric(difftime(data[subset, "time"], c(data[subset, "time"][1], data[subset, "time"][seq_len(n-1)]), unit="days"))
}

# Per-participant demographics and test-cadence summaries (rows are in test order)
participation_duration = data %>% group_by(id) %>% summarise(sex=first(sex), mean_age=mean(age), weeks=last(weeksSinceFirst), repetitions=last(repetition), median_intertest_interval=median(daysSinceLast), IQR_intertest_interval=IQR(daysSinceLast), .groups="keep")

# Generic predictor column (here: repetition number), as configured in params
data$predictor = data[,params$predictor]

Inclusion criteria: participation for at least 10 weeks and at least 10 repetitions performed per test, leading to the analysis of n=118 / 540 patients and n=12253 / 14031 tests. Among those, the median length of participation is 27.9 weeks (IQR 15.3-55.8, range 10.3-133.5) and the median number of repetitions is 61.5 (IQR 23.5-108.75, range 10-735).

# Cohort description table (all entries pre-formatted via the shadowed round()).
# NOTE(review): percent_female indexes prop.table(...)[[2]] (the TRUE level); this
# breaks if one sex level is absent in a subset — confirm for small cohorts.
t(data.frame(
  n_patients = paste0(length(unique(data$id_original)), " / ", n_patients_orig, " (", round(length(unique(data$id_original))/n_patients_orig*100,1), "%)"),
  n_hands = paste0(length(unique(data$id)), " / ", n_hands_orig, " (", round(length(unique(data$id))/n_hands_orig*100,1), "%)"),
  n_tests = paste0(nrow(data), " / ", n_orig, " (", round(nrow(data)/n_orig*100,1), "%)"),
  percent_female = paste0(round(prop.table(table(participation_duration$sex == "female"))[[2]]*100, 1)),
  age = paste0(round(median(participation_duration$mean_age,na.rm=T),1), " (", round(quantile(participation_duration$mean_age, 0.25, na.rm=T),1), "-", round(quantile(participation_duration$mean_age, 0.75, na.rm=T),1), ", range ", round(min(participation_duration$mean_age, na.rm=T),1), "-", round(max(participation_duration$mean_age, na.rm=T),1), ")"),
  repetitions = paste0(median(participation_duration$repetitions)+1, " repetitions (IQR ", quantile(participation_duration$repetitions+1, 0.25), "-", quantile(participation_duration$repetitions+1, 0.75), ", range ", min(participation_duration$repetitions+1), "-", max(participation_duration$repetitions+1), ")"),
  median_intertest_interval = paste0(round(median(participation_duration$median_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$median_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$median_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$median_intertest_interval),1), "-", round(max(participation_duration$median_intertest_interval),1), ")"),
  IQR_intertest_interval = paste0(round(median(participation_duration$IQR_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$IQR_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$IQR_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$IQR_intertest_interval),1), "-", round(max(participation_duration$IQR_intertest_interval),1), ")"),
  weeks = paste0(round(median(participation_duration$weeks),1), " weeks (IQR ", round(quantile(participation_duration$weeks, 0.25),1), "-", round(quantile(participation_duration$weeks, 0.75),1), ", range ", round(min(participation_duration$weeks),1), "-", round(max(participation_duration$weeks),1), ")")
))
##                           [,1]                                              
## n_patients                "118 / 540 (21.9%)"                               
## n_hands                   "118 / 540 (21.9%)"                               
## n_tests                   "12253 / 14031 (87.3%)"                           
## percent_female            "69.5"                                            
## age                       "50.2 (43.0-58.0, range 25.0-74.3)"               
## repetitions               "61.5 repetitions (IQR 23.5-108.75, range 10-735)"
## median_intertest_interval "1.1 days (IQR 1.0-2.1, range 1.0-24.9)"          
## IQR_intertest_interval    "1.6 days (IQR 0.4-4.0, range 0.1-32.0)"          
## weeks                     "27.9 weeks (IQR 15.3-55.8, range 10.3-133.5)"

Summary level analysis

Difference test

# One row per participant: first/mean/last score, participation stats, and the
# first-to-last difference ("diff") used as the summary-level learning measure
df = as.data.frame(data %>% group_by(id) %>% summarise(first=first(value), last=last(value), mean=mean(value), weeksSinceFirst=max(weeksSinceFirst), repetition=n(), first_age=first(age), last_age=last(age), mean_age=mean(age), .groups="keep") %>% mutate(diff=last-first))

df$predictor = df[, params$predictor]

# Paired t-test: last vs first score across participants
test = t.test(df$last, df$first, paired=T)
test
## 
##  Paired t-test
## 
## data:  df$last and df$first
## t = 0.75247, df = 117, p-value = 0.4533
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -5.407502 12.034621
## sample estimates:
## mean of the differences 
##                3.313559
# Covariate-only model: score difference explained by age and baseline score
mod0 = lm(diff ~ I(mean_age/10) + I(first/10), df)
summ0 = summary(mod0)
summ0
## 
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -138.104  -18.099    6.796   25.089  124.618 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     64.0343    31.4890   2.034 0.044320 *  
## I(mean_age/10)   2.7008     4.2556   0.635 0.526936    
## I(first/10)     -3.5897     0.8972  -4.001 0.000112 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.97 on 114 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1388, Adjusted R-squared:  0.1236 
## F-statistic: 9.183 on 2 and 114 DF,  p-value: 0.0002006
# Add amount of practice (log10 repetitions) on top of the covariates
mod = lm(diff ~ I(mean_age/10) + I(first/10) + log10(predictor), df)
confint(mod)
##                       2.5 %     97.5 %
## (Intercept)       -3.677925 124.089620
## I(mean_age/10)    -6.848258  10.776971
## I(first/10)       -5.510887  -1.877086
## log10(predictor) -13.040585  24.033930
summ = summary(mod)
summ
## 
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10) + log10(predictor), 
##     data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -138.075  -16.965    7.372   24.480  119.160 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       60.2058    32.2453   1.867 0.064477 .  
## I(mean_age/10)     1.9644     4.4482   0.442 0.659615    
## I(first/10)       -3.6940     0.9171  -4.028 0.000102 ***
## log10(predictor)   5.4967     9.3567   0.587 0.558067    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45.1 on 113 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1414, Adjusted R-squared:  0.1186 
## F-statistic: 6.202 on 3 and 113 DF,  p-value: 0.0006135
# additional variance explained by predictor
#print(summ$r.squared - summ0$r.squared)

# Observed improvement as a percentage of the mean baseline score, with the t-test CI
print(paste0("Average observed improvement over baseline: ", round(test$estimate/mean(df$first)*100, 1), " (", round(test$conf[1]/mean(df$first)*100, 1), "-", round(test$conf[2]/mean(df$first)*100, 1), ")"))
## [1] "Average observed improvement over baseline: 1.6 (-2.6-5.8)"
lab.y = 1.1*mean(df$last) # y position for the (currently disabled) significance bracket

# Bar chart of first / mean / last scores (mean ± SE)
p1 = ggbarplot(data.frame(Timepoint=rep(c("First","Mean","Last"),each=nrow(df)), value=c(df$first,df$mean,df$last)), "Timepoint", "value", add="mean_se", label=T, lab.nb.digits=1, lab.vjust=1.9, ylab=params$unit) + xlab("Score") #+ stat_compare_means(comparisons = list(c("First","Last")), paired=T, method="t.test", label.y=lab.y) + scale_y_continuous(expand=expansion(mult=c(0,0.1)))

# Forest plot of the regression estimates.
# NOTE(review): plot_model() receives the summary object (summ) rather than the
# fitted model (mod); sjPlot documents a model input — confirm this renders as intended.
p2 = plot_model(summ, show.values=T, vline.color = "grey", show.intercept=T, colors=linecolor, title=paste0("Difference from First to Last Score, R²=", round(summ$r.squared, 2)), axis.labels=rev(c("Intercept", "Age (per 10 years)", "First score (per 10)", paste0(params$xlab, " (log 10)"))), value.offset=0.3, show.p=F) + ylab("β estimates")

(p1 + p2) + plot_layout(widths=c(2,5)) & theme_pubr(base_family="Serif")

Confounders

# Pairwise scatter plots (with regression line and correlation coefficient) between
# age, first/last score, amount of practice, and the first-to-last difference
p_age_first = ggscatter(df, "mean_age", "first", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("First score") + theme_pubr(base_family="Serif")

p_age_pred = ggscatter(df, "mean_age", "predictor", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("Mean age") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")

p_age_last = ggscatter(df, "mean_age", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("Last score") + theme_pubr(base_family="Serif")

p_age_diff = ggscatter(df, "mean_age", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("Difference first to last score") + theme_pubr(base_family="Serif")

p_first_pred = ggscatter(df, "first", "predictor",  add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("First score") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")

p_first_last = ggscatter(df, "first", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("First score") + ylab("Last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_first_diff = ggscatter(df, "first", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("First score") + ylab("Difference first to last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_pred_last = ggscatter(df, "predictor", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_x_log10(expand = expansion(mult = c(.05, .15))) + xlab(paste0(params$xlab, " (log10)")) + ylab("Last score") + theme_pubr(base_family="Serif")

p_pred_diff = ggscatter(df, "predictor", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_x_log10(expand = expansion(mult = c(.05, .15))) + xlab(paste0(params$xlab, " (log10)")) + ylab("Difference first to last score") + theme_pubr(base_family="Serif")

p_last_diff = ggscatter(df, "last", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Last score") + ylab("Difference first to last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_last_pred = ggscatter(df, "last", "predictor",  add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("Last score") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")


# Marginal distributions for the diagonal of the panel matrix
p_age = gghistogram(df, "mean_age", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_first = gghistogram(df, "first", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_last = gghistogram(df, "last", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_pred = gghistogram(df, "predictor", bins=15) + scale_x_log10() + xlab(NULL) + theme_pubr(base_family="Serif")
p_diff = gghistogram(df, "diff", bins=15) + xlab("Difference first to last") + theme_pubr(base_family="Serif")

#(((p1+xlab(NULL)) + (p2+xlab(NULL)+ylab(NULL))) / ((p3+xlab(NULL)) + (p4+xlab(NULL)+ylab(NULL))) / ((p5) | (p6+ylab(NULL)))) & theme_pubr(base_family="Serif")

#(p_age_first | p_first) / (p_age_last | p_first_last | p_last) / (p_age_pred | p_first_pred | p_last_pred | p_pred) / (p_age_diff | p_first_diff | p_last_diff | p_pred_diff)

# Lower-triangular panel layout: histograms on the diagonal, scatter plots below
m <- matrix(NA, 5, 5)
m[lower.tri(m, diag = T)] <- 1:15
grid.arrange(grobs=list(
  p_age, p_age_first+xlab(NULL), p_age_pred+xlab(NULL), p_age_last+xlab(NULL), p_age_diff,
  p_first, p_first_pred+xlab(NULL)+ylab(""), p_first_last+xlab(NULL)+ylab(""), p_first_diff+ylab(""),
  p_pred, p_pred_last+xlab(NULL)+ylab(""), p_pred_diff+ylab(""),
  p_last, p_last_diff+ylab(""), 
  p_diff
), layout_matrix=m, heights=c(1,1,1,1,1.1))
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

#GGally::ggpairs(df[c("mean_age", "first", "last", "predictor", "diff")])
#pairs(df[c("mean_age", "first", "last", "predictor", "diff")], upper.panel=NULL)
#corrplot::corrplot(cor(df[c("mean_age", "first", "last", "predictor", "diff")], use="complete.obs"))

Learning curve: Model selection

# Candidate model 1: penalized smoothing spline with a random participant intercept
smoothing_spline = gamm(value ~ s(predictor, bs="ps"), random=list(id=~1), data=data)
summary(smoothing_spline$lme)
## Linear mixed-effects model fit by maximum likelihood
##  Data: strip.offset(mf) 
##      AIC      BIC    logLik
##   114671 114708.1 -57330.52
## 
## Random effects:
##  Formula: ~Xr - 1 | g
##  Structure: pdIdnot
##              Xr1      Xr2      Xr3      Xr4      Xr5      Xr6      Xr7      Xr8
## StdDev: 3.419627 3.419627 3.419627 3.419627 3.419627 3.419627 3.419627 3.419627
## 
##  Formula: ~1 | id %in% g
##         (Intercept) Residual
## StdDev:    42.25887  25.3961
## 
## Fixed effects: y ~ X - 1 
##                      Value Std.Error    DF  t-value p-value
## X(Intercept)     211.45541  3.919116 12134 53.95487  0.0000
## Xs(predictor)Fx1 -15.28688  9.571125 12134 -1.59719  0.1102
##  Correlation: 
##                  X(Int)
## Xs(predictor)Fx1 0.011 
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -11.4589971  -0.2757026   0.0927828   0.4246101   6.7845851 
## 
## Number of Observations: 12253
## Number of Groups: 
##         g id %in% g 
##         1       118
# Smooth-term summary of the same GAMM fit
summary(smoothing_spline$gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## value ~ s(predictor, bs = "ps")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  211.455      3.919   53.96   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df     F  p-value    
## s(predictor) 6.122  6.122 5.199 6.75e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0077   
##   Scale est. = 644.96    n = 12253
# Candidate model 2: linear mixed model with a random participant intercept
REM_linear = lmer(value ~ (1|id) + predictor, data)
equ_linear = function(t) fixef(REM_linear)[1] + fixef(REM_linear)[2]*t # population-level line
summary(REM_linear)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor
##    Data: data
## 
## REML criterion at convergence: 114677.3
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -11.4474  -0.2771   0.0938   0.4285   6.7270 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 1810.6   42.55   
##  Residual              646.7   25.43   
## Number of obs: 12253, groups:  id, 118
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept) 211.403029   3.938880  53.671
## predictor    -0.002793   0.002511  -1.112
## 
## Correlation of Fixed Effects:
##           (Intr)
## predictor -0.033
# Candidate model 3: quadratic fixed effect of practice
REM_quadratic = lmer(value ~ (1|id) + predictor + I(predictor^2), data)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
equ_quadratic = function(t) fixef(REM_quadratic)[1] + fixef(REM_quadratic)[2]*t + fixef(REM_quadratic)[3]*t^2 # population-level curve
summary(REM_quadratic)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor + I(predictor^2)
##    Data: data
## 
## REML criterion at convergence: 114691.1
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -11.4501  -0.2788   0.0952   0.4277   6.7587 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 1803.0   42.46   
##  Residual              646.3   25.42   
## Number of obs: 12253, groups:  id, 118
## 
## Fixed effects:
##                  Estimate Std. Error t value
## (Intercept)     2.109e+02  3.935e+00  53.606
## predictor       1.150e-02  5.853e-03   1.965
## I(predictor^2) -2.923e-05  1.081e-05  -2.704
## 
## Correlation of Fixed Effects:
##             (Intr) prdctr
## predictor   -0.055       
## I(prdctr^2)  0.045 -0.903
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
# Candidate model 4: bounded growth (asymptotic regression, SSasymp) nonlinear mixed
# model with random baseline (y0) and boundary (yf) per participant.
# NOTE(review): the optimizer reports "failure to converge in 10000 evaluations"
# below — estimates are used regardless; confirm this is acceptable.
REM_bounded = nlmer(value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0|id) + (yf|id), data = data, start=c(yf=40, y0=20, log_alpha=-1))
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
y0=fixef(REM_bounded)["y0"]
yf=fixef(REM_bounded)["yf"]
log_alpha=fixef(REM_bounded)["log_alpha"]
# Fitted curve; the defaults evaluate the population fit, while explicit arguments
# allow plugging in CI bounds for the ribbon
equ_bounded = function(t, yf=fixef(REM_bounded)[["yf"]], y0=fixef(REM_bounded)[["y0"]], log_alpha=fixef(REM_bounded)[["log_alpha"]]) yf+(y0-yf)*exp(-exp(log_alpha)*t)
summary(REM_bounded)
## Warning in vcov.merMod(object, use.hessian = use.hessian): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
## Warning in vcov.merMod(object, correlation = correlation, sigm = sig): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
## Nonlinear mixed model fit by maximum likelihood  ['nlmerMod']
## Formula: value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0 | id) + (yf |  
##     id)
##    Data: data
## 
##      AIC      BIC   logLik deviance df.resid 
## 113385.7 113430.2 -56686.8 113373.7    12247 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -12.2697  -0.2843   0.0759   0.4095   5.2328 
## 
## Random effects:
##  Groups   Name Variance Std.Dev.
##  id       y0   2096     45.79   
##  id.1     yf   2925     54.08   
##  Residual       567     23.81   
## Number of obs: 12253, groups:  id, 118
## 
## Fixed effects:
##            Estimate Std. Error t value
## yf        210.32786    5.51626   38.13
## y0        211.90878    4.27318   49.59
## log_alpha  -3.87292    0.06559  -59.05
## 
## Correlation of Fixed Effects:
##           yf     y0    
## y0        -0.042       
## log_alpha  0.024 -0.003
## convergence code: 0
## failure to converge in 10000 evaluations
# Practice rate constant on the natural scale
exp(log_alpha)
##  log_alpha 
## 0.02079763
cat("Average improvement over baseline: ", (yf-y0)/y0*100)
## Average improvement over baseline:  -0.7460363
# Model comparison across the four candidates: RMSE, MAE, and AIC with edf
RMSE = sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) sqrt(mean(resid(mod)^2))) # RMSE
RMSE
## [1] 25.30676 25.29966 25.26893 23.60502
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) mean(abs(resid(mod)))) # MAE
## [1] 15.62330 15.62261 15.58052 14.45061
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) extractAIC(mod)) # edf & AIC: smoothing_spline$lme always has edf 5
##          [,1]     [,2]   [,3]     [,4]
## [1,]      4.0      5.0      5      6.0
## [2,] 114679.8 114674.5 114671 113385.7
edf = sapply(list(REM_linear, REM_quadratic, smoothing_spline$gam, REM_bounded), function(mod) { nrow(data)-df.residual(mod) }) # while smoothing_spline$gam often has much higher edf
edf
## [1] 4.000000 5.000000 7.122494 6.000000
# shadowed round(): formats to one decimal as character strings for the plot labels
RMSE = round(RMSE,1)
edf = round(edf,1)

Plot

# Per-participant follow-up length in predictor units (named vector: id -> max predictor)
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(predictor), .groups="keep") %>% deframe()
# Number of participants still contributing data at each 10-repetition bin
remaining_participants_bins = seq(0,350,by=10)
remaining_participants = data.frame(x=remaining_participants_bins, text=sapply(remaining_participants_bins, function(x) sum(participation_duration>=x)))

# Truncate the x axis at the first bin with fewer than 10 remaining participants
xmax = remaining_participants$x[which(remaining_participants$text<10)[1]]

# Central 90% of observed values, used as the y-axis range
range_90p =quantile(data$value, probs=c(0.05,0.95))

# Trajectories plus all four fitted curves, with an RMSE/edf legend box
p1 = ggplot() +
  geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.1) +
  geom_line(aes(x,y), data.frame(x=0:xmax, y=predict(smoothing_spline$gam, newdata=data.frame(predictor=0:xmax))), linetype="longdash", size=1) + xlim(0,xmax) + ylim(range_90p[1], range_90p[2]) +
  stat_function(fun=equ_linear, color="blue", linetype="dotted", size=1) + 
  stat_function(fun=equ_quadratic, color="green4", linetype="dashed", size=1) + 
  stat_function(fun=equ_bounded, color=linecolor, size=1) + 
  theme_pubr(base_family="Serif") + no_x + xlab(NULL) + ylab(params$unit) +
  geom_richtext(aes(x,y,label=label,hjust=1), data.frame(x=0.8*xmax, y=range_90p[2]-(range_90p[2]-range_90p[1])/1.3, label=paste0("Model RMSE (edf):<br><span style='color:#0000ff'>····· Linear: ", RMSE[1], " (", edf[1], ") </span><br><span style='color:#008b00'>- - - Quadratic: ", RMSE[2], " (", edf[2], ") </span><br><span style='color:#000000'>— — Smoothing spline: ", RMSE[3], " (", edf[3], ") </span><br><span style='color:", linecolor, "'>— Bounded growth: ", RMSE[4], " (", edf[4], ") </span>")), family="Serif")

# Strip below the curves: remaining participant counts along the x axis
p2 = ggplot(remaining_participants[remaining_participants$x<=xmax,]) +
  geom_text(aes(x=x,y="A",label=text), family="Serif") +
  theme_pubr(base_family="Serif") + no_y + xlab(params$xlab) + ylab(paste0("Remaining \n ", params$unit_n, "s")) +
  scale_x_continuous(breaks=seq(0,xmax,by=10))

(p1 / p2) + plot_layout(heights=c(0.9,0.1))
## Warning: Removed 1078 row(s) containing missing values (geom_path).

# Profile-likelihood 95% CIs for the bounded-growth fixed effects.
# Skipped at runtime for this test: params$bounded.growth.confidence.interval is FALSE.
if (params$bounded.growth.confidence.interval) conf = confint.merMod(REM_bounded, c("y0","yf","log_alpha"), method="profile")
if (params$bounded.growth.confidence.interval) {
  print(conf)
  # log_alpha is estimated on the log scale; exponentiate its CI to the rate scale
  print(exp(conf[3,]))
  # Relative improvement of the boundary over baseline with CI bounds from conf rows
  print(paste0("Average boundary improvement over baseline: ", round((yf-y0)/y0*100, 1), " (", round((conf[1,1]-conf[2,1])/conf[2,1]*100, 1), "-", round((conf[1,2]-conf[2,2])/conf[2,2]*100, 1), ")"))
}

Bounded growth model

# Helper curves derived from the fitted bounded-growth fixed effects
# (yf, y0, log_alpha are globals set after fitting REM_bounded).

# Slope (first derivative) of the growth curve at time t
equ_diff_REM_bounded = function(t) {
  alpha = exp(log_alpha)
  alpha * (yf - y0) * exp(alpha * -t)
}
# Time at which the curve's slope has decayed to target_slope (inverse of the derivative)
equ_diff_get_time_REM_bounded = function(target_slope) {
  alpha = exp(log_alpha)
  log(alpha * (yf - y0) / target_slope) / alpha
}

# Time at which the curve reaches target_value (inverse of equ_bounded)
equ_bounded_get_x = function(target_value) {
  remaining_fraction = (yf - y0) / (yf - target_value)
  exp(-log_alpha) * log(remaining_fraction)
}

# Evaluate the curve at baseline, the half-practice point, and 90% of the practice gain
growth_percentiles = c(0, 0.5, 0.9)
names_percentiles = c("baseline", "half-practice point", "90% practice")
selected_timepoints = equ_bounded_get_x(y0+(yf-y0)*growth_percentiles)
# Annotation points for the plot: value (y), slope (m), and time at each percentile.
# Note: round() is the script's shadowed sprintf-based formatter (returns character).
example_slopes_bounded = data.frame(
  x=selected_timepoints,
  y=equ_bounded(selected_timepoints),
  label=paste0("y=", round(equ_bounded(selected_timepoints),1), ", m=", signif(equ_diff_REM_bounded(selected_timepoints),2), " at ", params$unit_time, " ", round(selected_timepoints,0), ", ", names_percentiles),
  vjust=1.5
)
# Extra annotation row marking the fitted boundary (asymptote yf) near the right edge
example_slopes_bounded = rbind(example_slopes_bounded, list(x=0.83*xmax, y=yf, label=paste0("boundary: ", round(yf, 1)), vjust=-1.0))

if (params$bounded.growth.confidence.interval) ribbon = data.frame(x=seq(0,xmax,0.05), ymin=equ_bounded(seq(0,xmax,0.05), conf["yf","2.5 %"], conf["y0","2.5 %"], conf["log_alpha","2.5 %"]), ymax=equ_bounded(seq(0,xmax,0.05), conf["yf","97.5 %"], conf["y0","97.5 %"], conf["log_alpha","97.5 %"]))

# Per-id test counts for the cohort summary printed below.
quant = quantile(table(data$id))
print(paste0("n tests = ", nrow(data), " (n ", params$unit_n, "s = ", length(unique(data$id)), ", median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 12253 (n patients = 118, median tests per patient: 61.5, IQR 23.5-108.75)"
# Spaghetti plot of all trajectories, the fitted bounded-growth curve, a
# vertical line at the half-practice point, and the annotated example slopes.
p1 = ggplot() + geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.2) +
  theme_pubr(base_family="Serif") + scale_x_continuous(limits = c(0,xmax), expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + xlab(params$xlab) + ylab(params$unit) +
  geom_vline(xintercept=example_slopes_bounded[2,"x"], color=linecolor, linetype=2) +
  stat_function(fun=equ_bounded, color=linecolor, size=1) +
  geom_point(data=example_slopes_bounded[1:(nrow(example_slopes_bounded)-1),], aes(x,y), color=linecolor, size=5) +
  geom_text(data=example_slopes_bounded, aes(x,y,label=label, vjust=vjust), color=linecolor, hjust=-0.01, family="Serif")

# 95% CI ribbon around the curve (only when the profile CI was computed).
if (params$bounded.growth.confidence.interval) p1 = p1 + geom_ribbon(aes(x=x, ymin=ymin, ymax=ymax), ribbon, fill=linecolor, alpha=0.3)

p1
## Warning: Removed 852 row(s) containing missing values (geom_path).

Quantile regression

# Censor data at the half-practice point unless an explicit cutoff is given.
# Note: round() is the sprintf-based formatter (returns character), so the
# as.integer() converts that string back to an integer cutoff.
if (is.null(params$censor_after)) {
  censor_after = as.integer(round(selected_timepoints[2])) # half-practice point
} else {
  censor_after = params$censor_after
}
data_censored = data[data$predictor <= censor_after,]

percentiles = c(0.05,0.25,0.5,0.75,0.95)

# Quantile regression of value on the predictor, one fit per percentile (tau).
QR = rq(value ~ predictor, tau=percentiles, data_censored)
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
# Kernel-based standard errors, computed ONCE and reused (the original
# recomputed summary(QR, se="ker") inside every sapply iteration).
QR_summaries = summary(QR, se="ker")
# Print each percentile's coefficient table and Wald 95% CI; collect the
# slope p-values for multiplicity correction below.
p_vals = sapply(seq_along(QR_summaries), function(i) {
  summ = coef(QR_summaries[[i]])
  print(summ)
  print(paste0("Intercept: ", round(summ[1,1],1), " (", round(summ[1,1]-1.96*summ[1,2],1), "-", round(summ[1,1]+1.96*summ[1,2],1), "), beta: ", round(summ[2,1],2), " (", round(summ[2,1]-1.96*summ[2,2],2), "-", round(summ[2,1]+1.96*summ[2,2],2), ")"))
  summ[2,4] # Pr(>|t|) of the predictor slope
})
##             Value Std. Error    t value  Pr(>|t|)
## (Intercept)   109   9.320741 11.6943494 0.0000000
## predictor       1   1.604296  0.6233265 0.5331799
## [1] "Intercept: 109.0 (90.7-127.3), beta: 1.00 (-2.14-4.14)"
##             Value Std. Error    t value Pr(>|t|)
## (Intercept) 192.0  4.0608175 47.2811205 0.000000
## predictor     0.4  0.6500047  0.6153802 0.538412
## [1] "Intercept: 192.0 (184.0-200.0), beta: 0.40 (-0.87-1.67)"
##             Value Std. Error  t value Pr(>|t|)
## (Intercept)   223  2.4391042 91.42701        0
## predictor       0  0.4200786  0.00000        1
## [1] "Intercept: 223.0 (218.2-227.8), beta: 0.00 (-0.82-0.82)"
##                   Value Std. Error    t value  Pr(>|t|)
## (Intercept) 239.4444444  2.1488187 111.430733 0.0000000
## predictor     0.5555556  0.3746224   1.482975 0.1383248
## [1] "Intercept: 239.4 (235.2-243.7), beta: 0.56 (-0.18-1.29)"
##              Value Std. Error    t value  Pr(>|t|)
## (Intercept) 271.00  3.0384559 89.1900388 0.0000000
## predictor    -0.25  0.4981497 -0.5018572 0.6158534
## [1] "Intercept: 271.0 (265.0-277.0), beta: -0.25 (-1.23-0.73)"
# Bonferroni-correct the five per-percentile slope p-values.
p_vals = p.adjust(p_vals, method="bonferroni")

# Joint test across quantiles (anova.rq); its p-value labels the plot below.
ANOVA = anova(QR)

# Per-id test counts within the censored window.
quant = quantile(table(data_censored$id))
print(paste0("n tests = ", nrow(data_censored), " (median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 1295 (median tests per patient: 11, IQR 11-11)"
## [1] "n tests = 1295 (median tests per patient: 11, IQR 11-11)"
# Format a p-value for plot labels: "= <rounded value>", or "< 2e-16" when
# rounding collapses the value to zero (mirrors R's smallest printed p-value).
signif_p = function(x, digits=1) {
  rounded = signif(x, digits)
  if (as.character(rounded) == "0") {
    "< 2e-16"
  } else {
    paste0("= ", rounded)
  }
}

# Censored-window spaghetti plot with one fitted line per percentile,
# labelled with adjusted p-values, plus the overall ANOVA p and the cutoff.
ggplot() + geom_line(aes_string("predictor", "value", group="id"), data_censored, alpha=0.2, color="darkgrey") + theme_pubr(base_family="Serif") + scale_x_continuous(expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + theme(legend.position = "none") + xlab(params$xlab) + ylab(params$unit) +
  geom_abline(intercept=coef(QR)[1,], slope=coef(QR)[2,], color=linecolor) +
  geom_text(data=data.frame(intercept=coef(QR)[1,], label=paste0(percentiles*100, "th percentile: β = ", round(coef(QR)[2,],1), ", ", "p.adj ", sapply(p_vals,signif_p))),
            mapping=aes(x=1,y=intercept, label=label), color=linecolor, hjust="left", vjust=1, family="Serif") +
  coord_cartesian(xlim=c(0,censor_after)) +
  geom_text(aes(x=x,y=y,label=label), data.frame(x=0.8*censor_after, y=0, label=paste0("ANOVA p ", signif_p(ANOVA$table$pvalue, 1))), vjust=-1.5, family="Serif") +
  geom_vline(xintercept=censor_after, color=linecolor, linetype=2)

Sensitivity Analysis 2

Performance as a function of weeks since first test Minimum number of repetitions: 5, minimum number of weeks: 5

Cognition

# Analysis parameters for Sensitivity Analysis 2: SDMT performance modelled
# as a function of weeks since the first test.
params = list(
  test_code = "ips",                      # Floodlight test code (SDMT / information processing speed)
  test_metric_code = "correct_responses", # metric analysed
  unit = "SDMT: Correct Responses",       # y-axis label
  unit_n = "patient",                     # unit of observation, used in labels
  unit_time = "week",                     # time unit, used in annotations
  min_repetitions = 5,                    # inclusion: at least 5 repetitions
  min_weeks = 5,                          # inclusion: at least 5 weeks of participation
  predictor = "weeksSinceFirst",          # column used as x in all models
  xlab = "Weeks",
  bounded.growth.confidence.interval = FALSE, # FALSE/TRUE, not F/T (F is reassignable)
  up_to_date = "2021-05-01"               # analyse only data before this date
)
library(data.table) # fread
library(parsedate) # parse_date
library(dplyr) # group_by
library(tibble) # deframe
library(lme4) # lmer
library(mgcv) # gamm
library(quantreg) # rq
library(patchwork) # plot_layout
library(gridExtra) # grid.arrange
library(ggpubr) # ggscatter
library(ggtext) # geom_text
library(sjPlot) # plot_model

# Download from: https://dataset.floodlightopen.com/public-blobs-prod/complete_dataset.csv
data = fread("complete_dataset.csv", data.table=F)

# Prepare dataset: keep only the selected test and non-control participants
data = data[data$testCode == params$test_code & !data$participantIsControl,]
data$time = parse_date(data$testStartedAt)
# Keep tests up to params$up_to_date. `<=` compares against midnight UTC of
# that date, so the day itself is effectively excluded (only an exact-midnight
# test would pass the boundary).
data = data[data$time <= as.POSIXct(params$up_to_date, tz="UTC"),] # only analyse data up to (excluding) params$up_to_date
data = data[!duplicated(data),] # sometimes contains true duplicates for some reason (even with the same testResultMetricId)

# For "Finger Pinching" hand_used has to be determined
if (params$test_code == "pinching") {
  library(tidyr) # pivot_wider
  # just one means either "hand" or "successful_pinches" values are missing, remove those
  table(table(data$time))
  data = data[!data$time %in% names(which(table(data$time)==1)), ]
  # One row per timestamp with the hand and pinch-count metrics as columns.
  data = as.data.frame(pivot_wider(data, id_cols=c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time"), names_from="testMetricCode", values_from="testResultMetricValue"))
} else {
  # Other tests report a single metric; hand_used is not applicable.
  data = data[data$testMetricCode == params$test_metric_code,]
  data$hand_used = NA
  data = data[c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time", "testResultMetricValue", "hand_used")]
}

colnames(data) = c("id", "control", "sex", "birthyear", "time", "value", "hand_used")
data$age = year(data$time)-data$birthyear # Estimate age (year resolution only)
data = data[order(as.character(data$id)),]

# 0 result values are discarded
data = data[!is.na(data$value) & data$value != 0,]

# Consider those supposedly younger than 18 (minimum study age) and older than 90 as NA
data$age[data$age < 18 | data$age > 90] = NA

# For pinching, each hand becomes its own analysis unit; for other tests
# hand_used is NA, so every id becomes "<original>_handNA".
data$id_original = data$id
data$id = paste0(data$id, "_hand", data$hand_used)

data$day = as.IDate(data$time)
# WARNING: shadows base::round with a sprintf formatter that RETURNS A
# CHARACTER STRING with fixed decimals; all later round() calls rely on this.
round = function(x, digits=0) sprintf(paste0("%.", digits, "f"), x)

# Reusable themes to hide one axis in composed plots.
no_x = theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
no_y = theme(axis.text.y=element_blank(), axis.ticks.y=element_blank())

linecolor = "#c71138"

Participant selection

# At least x weeks & repetitions
# Number each test per id (0-based repetition index) and compute weeks elapsed
# since that id's first test. Assumes rows are time-ordered within id —
# TODO confirm (data was only sorted by id above).
# Fix: difftime's parameter is `units`; the original used the partially
# matching `unit=`, which works only via partial argument matching.
for (id in unique(data$id)) {
  subset = data$id == id
  n = sum(subset)
  data[subset, "repetition"] = seq_len(n)-1
  data[subset, "weeksSinceFirst"] = as.numeric(difftime(data[subset, "time"], data[subset, "time"][1], units="weeks"))
}

# Totals before applying inclusion criteria (reported in the tables below).
n_orig = nrow(data)
n_patients_orig = length(unique(data$id_original))
n_hands_orig = length(unique(data$id))

# Per-id participation length (weeks) and number of repetitions; last() picks
# the final row of each id.
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(weeksSinceFirst), repetitions=last(repetition), .groups="keep")

Among the total n=1095 patients with n=5715 repetitions, the median length of participation is 0.0 weeks (IQR 0.0-8.6, range 0.0-151.6) and the median number of repetitions is 1 (IQR 1-4, range 1-106).

# Apply inclusion criteria: at least min_weeks of participation and at least
# min_repetitions tests (repetitions is 0-based, hence the +1).
data = data[data$id %in% participation_duration$id[participation_duration$weeks >= params$min_weeks & participation_duration$repetitions+1 >= params$min_repetitions],]

# Days elapsed since the previous test (0 for each id's first test).
# Fix 1: difftime's parameter is `units` (original used partial match `unit=`).
# Fix 2: the lag is built with head(times, -1), which is also correct for a
# single-row id (the original `times[1:(n-1)]` indexes with 1:0 when n == 1).
for (id in unique(data$id)) {
  subset = data$id == id
  times = data[subset, "time"]
  data[subset, "daysSinceLast"] = as.numeric(difftime(times, c(times[1], head(times, -1)), units="days"))
}

# Per-id demographics and test cadence. NOTE(review): mean(age) has no na.rm,
# so ids with any unknown age get mean_age = NA; downstream summaries use
# na.rm=T, so this presumably excludes them intentionally — verify.
participation_duration = data %>% group_by(id) %>% summarise(sex=first(sex), mean_age=mean(age), weeks=last(weeksSinceFirst), repetitions=last(repetition), median_intertest_interval=median(daysSinceLast), IQR_intertest_interval=IQR(daysSinceLast), .groups="keep")

# Column actually used as x in all subsequent models and plots.
data$predictor = data[,params$predictor]

Inclusion criteria: participation for at least 5 weeks and at least 5 repetitions performed per test, leading to the analysis of n=251 / 1095 patients and n=4388 / 5715 tests. Among those, the median length of participation is 16.5 weeks (IQR 10.1-45.9, range 5.0-151.6) and the median number of repetitions is 11 (IQR 6.5-18, range 5-106).

# One-column character table of cohort descriptives. round() is the sprintf
# formatter defined above, so every cell is a formatted string.
t(data.frame(
  n_patients = paste0(length(unique(data$id_original)), " / ", n_patients_orig, " (", round(length(unique(data$id_original))/n_patients_orig*100,1), "%)"),
  n_hands = paste0(length(unique(data$id)), " / ", n_hands_orig, " (", round(length(unique(data$id))/n_hands_orig*100,1), "%)"),
  n_tests = paste0(nrow(data), " / ", n_orig, " (", round(nrow(data)/n_orig*100,1), "%)"),
  percent_female = paste0(round(prop.table(table(participation_duration$sex == "female"))[[2]]*100, 1)),
  age = paste0(round(median(participation_duration$mean_age,na.rm=T),1), " (", round(quantile(participation_duration$mean_age, 0.25, na.rm=T),1), "-", round(quantile(participation_duration$mean_age, 0.75, na.rm=T),1), ", range ", round(min(participation_duration$mean_age, na.rm=T),1), "-", round(max(participation_duration$mean_age, na.rm=T),1), ")"),
  repetitions = paste0(median(participation_duration$repetitions)+1, " repetitions (IQR ", quantile(participation_duration$repetitions+1, 0.25), "-", quantile(participation_duration$repetitions+1, 0.75), ", range ", min(participation_duration$repetitions+1), "-", max(participation_duration$repetitions+1), ")"),
  median_intertest_interval = paste0(round(median(participation_duration$median_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$median_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$median_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$median_intertest_interval),1), "-", round(max(participation_duration$median_intertest_interval),1), ")"),
  IQR_intertest_interval = paste0(round(median(participation_duration$IQR_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$IQR_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$IQR_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$IQR_intertest_interval),1), "-", round(max(participation_duration$IQR_intertest_interval),1), ")"),
  weeks = paste0(round(median(participation_duration$weeks),1), " weeks (IQR ", round(quantile(participation_duration$weeks, 0.25),1), "-", round(quantile(participation_duration$weeks, 0.75),1), ", range ", round(min(participation_duration$weeks),1), "-", round(max(participation_duration$weeks),1), ")")
))
##                           [,1]                                         
## n_patients                "251 / 1095 (22.9%)"                         
## n_hands                   "251 / 1095 (22.9%)"                         
## n_tests                   "4388 / 5715 (76.8%)"                        
## percent_female            "70.5"                                       
## age                       "50.1 (42.0-58.0, range 20.0-79.0)"          
## repetitions               "11 repetitions (IQR 6.5-18, range 5-106)"   
## median_intertest_interval "7.6 days (IQR 7.1-9.5, range 6.7-87.1)"     
## IQR_intertest_interval    "2.8 days (IQR 0.7-8.1, range 0.0-133.8)"    
## weeks                     "16.5 weeks (IQR 10.1-45.9, range 5.0-151.6)"

Summary level analysis

Difference test

# Per-id summary: first/last/mean scores plus covariates; diff = last - first
# is the summary-level improvement analysed in the models below.
df = as.data.frame(data %>% group_by(id) %>% summarise(first=first(value), last=last(value), mean=mean(value), weeksSinceFirst=max(weeksSinceFirst), repetition=n(), first_age=first(age), last_age=last(age), mean_age=mean(age), .groups="keep") %>% mutate(diff=last-first))

df$predictor = df[, params$predictor]

# Paired t-test: last vs first score within each id.
test = t.test(df$last, df$first, paired=T)
test
## 
##  Paired t-test
## 
## data:  df$last and df$first
## t = 21.067, df = 250, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   9.086812 10.960997
## sample estimates:
## mean of the differences 
##                 10.0239
# Baseline model: improvement explained by age and first score only (both
# scaled per 10 units for readable coefficients).
mod0 = lm(diff ~ I(mean_age/10) + I(first/10), df)
summ0 = summary(mod0)
summ0
## 
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24.4317  -4.4998  -0.0443   4.4295  24.4693 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     24.1237     3.8580   6.253 1.76e-09 ***
## I(mean_age/10)  -0.4513     0.4952  -0.911    0.363    
## I(first/10)     -3.0882     0.4879  -6.329 1.15e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.92 on 247 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1671, Adjusted R-squared:  0.1604 
## F-statistic: 24.78 on 2 and 247 DF,  p-value: 1.555e-10
# Full model: add the (log10) predictor — does more exposure explain
# additional improvement beyond age and baseline score?
mod = lm(diff ~ I(mean_age/10) + I(first/10) + log10(predictor), df)
confint(mod)
##                      2.5 %     97.5 %
## (Intercept)      14.246168 29.5212420
## I(mean_age/10)   -1.715353  0.2444261
## I(first/10)      -4.240819 -2.3287247
## log10(predictor)  1.085725  5.6093147
summ = summary(mod)
summ
## 
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10) + log10(predictor), 
##     data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.467  -4.449  -0.073   4.569  22.652 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       21.8837     3.8776   5.644 4.57e-08 ***
## I(mean_age/10)    -0.7355     0.4975  -1.478  0.14060    
## I(first/10)       -3.2848     0.4854  -6.767 9.54e-11 ***
## log10(predictor)   3.3475     1.1483   2.915  0.00388 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.817 on 246 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1949, Adjusted R-squared:  0.1851 
## F-statistic: 19.86 on 3 and 246 DF,  p-value: 1.472e-11
# additional variance explained by predictor
#print(summ$r.squared - summ0$r.squared)

# Observed improvement as a percentage of the mean baseline score, with the
# paired-t confidence interval transformed onto the same scale.
print(paste0("Average observed improvement over baseline: ", round(test$estimate/mean(df$first)*100, 1), " (", round(test$conf[1]/mean(df$first)*100, 1), "-", round(test$conf[2]/mean(df$first)*100, 1), ")"))
## [1] "Average observed improvement over baseline: 26.1 (23.7-28.6)"
# y position once used by the commented-out significance bracket below.
lab.y = 1.1*mean(df$last)

# Bar plot of first/mean/last scores (mean ± SE, value labels inside bars).
p1 = ggbarplot(data.frame(Timepoint=rep(c("First","Mean","Last"),each=nrow(df)), value=c(df$first,df$mean,df$last)), "Timepoint", "value", add="mean_se", label=T, lab.nb.digits=1, lab.vjust=1.9, ylab=params$unit) + xlab("Score") #+ stat_compare_means(comparisons = list(c("First","Last")), paired=T, method="t.test", label.y=lab.y) + scale_y_continuous(expand=expansion(mult=c(0,0.1)))

# Coefficient plot of the full model (note: plot_model is given the summary object).
p2 = plot_model(summ, show.values=T, vline.color = "grey", show.intercept=T, colors=linecolor, title=paste0("Difference from First to Last Score, R²=", round(summ$r.squared, 2)), axis.labels=rev(c("Intercept", "Age (per 10 years)", "First score (per 10)", paste0(params$xlab, " (log 10)"))), value.offset=0.3, show.p=F) + ylab("β estimates")

(p1 + p2) + plot_layout(widths=c(2,5)) & theme_pubr(base_family="Serif")

Confounders

# Helper: standardized pairwise scatter (regression line, 95% CI band,
# Pearson correlation coefficient) for the confounder matrix below.
# log_axis: "x" log10-scales x (with extra right expansion for labels),
# "y" log10-scales y, "none" leaves both axes linear.
# Extracted to remove the 11-fold duplication of identical ggscatter calls.
pair_scatter = function(xvar, yvar, xlab_text, ylab_text, log_axis="none") {
  p = ggscatter(df, xvar, yvar, add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor))
  if (log_axis == "x") p = p + scale_x_log10(expand = expansion(mult = c(.05, .15)))
  if (log_axis == "y") p = p + scale_y_log10()
  p + xlab(xlab_text) + ylab(ylab_text) + theme_pubr(base_family="Serif")
}

# Shared axis labels.
pred_lab = paste0(params$xlab, " (log10)")
diff_lab = "Difference first to last score"

p_age_first = pair_scatter("mean_age", "first", "Mean age", "First score")

p_age_pred = pair_scatter("mean_age", "predictor", "Mean age", pred_lab, log_axis="y")

p_age_last = pair_scatter("mean_age", "last", "Mean age", "Last score")

p_age_diff = pair_scatter("mean_age", "diff", "Mean age", diff_lab)

p_first_pred = pair_scatter("first", "predictor", "First score", pred_lab, log_axis="y")

p_first_last = pair_scatter("first", "last", "First score", "Last score")

p_first_diff = pair_scatter("first", "diff", "First score", diff_lab)

p_pred_last = pair_scatter("predictor", "last", pred_lab, "Last score", log_axis="x")

p_pred_diff = pair_scatter("predictor", "diff", pred_lab, diff_lab, log_axis="x")

p_last_diff = pair_scatter("last", "diff", "Last score", diff_lab)

p_last_pred = pair_scatter("last", "predictor", "Last score", pred_lab, log_axis="y")

# Marginal histograms for each variable (log10 x for the predictor, matching
# the scatter axes above).
p_age = gghistogram(df, "mean_age", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_first = gghistogram(df, "first", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_last = gghistogram(df, "last", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_pred = gghistogram(df, "predictor", bins=15) + scale_x_log10() + xlab(NULL) + theme_pubr(base_family="Serif")
p_diff = gghistogram(df, "diff", bins=15) + xlab("Difference first to last") + theme_pubr(base_family="Serif")

#(((p1+xlab(NULL)) + (p2+xlab(NULL)+ylab(NULL))) / ((p3+xlab(NULL)) + (p4+xlab(NULL)+ylab(NULL))) / ((p5) | (p6+ylab(NULL)))) & theme_pubr(base_family="Serif")

#(p_age_first | p_first) / (p_age_last | p_first_last | p_last) / (p_age_pred | p_first_pred | p_last_pred | p_pred) / (p_age_diff | p_first_diff | p_last_diff | p_pred_diff)

# Lower-triangular 5x5 layout: lower.tri fills 1:15 column-wise, so the grobs
# list is supplied column by column (first column: age vs everything, etc.).
m <- matrix(NA, 5, 5)
m[lower.tri(m, diag = T)] <- 1:15
grid.arrange(grobs=list(
  p_age, p_age_first+xlab(NULL), p_age_pred+xlab(NULL), p_age_last+xlab(NULL), p_age_diff,
  p_first, p_first_pred+xlab(NULL)+ylab(""), p_first_last+xlab(NULL)+ylab(""), p_first_diff+ylab(""),
  p_pred, p_pred_last+xlab(NULL)+ylab(""), p_pred_diff+ylab(""),
  p_last, p_last_diff+ylab(""), 
  p_diff
), layout_matrix=m, heights=c(1,1,1,1,1.1))
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

#GGally::ggpairs(df[c("mean_age", "first", "last", "predictor", "diff")])
#pairs(df[c("mean_age", "first", "last", "predictor", "diff")], upper.panel=NULL)
#corrplot::corrplot(cor(df[c("mean_age", "first", "last", "predictor", "diff")], use="complete.obs"))

Learning curve: Model selection

# Penalized-spline GAMM with a random intercept per id: the flexible
# reference curve the parametric models are compared against.
smoothing_spline = gamm(value ~ s(predictor, bs="ps"), random=list(id=~1), data=data)
summary(smoothing_spline$lme)
## Linear mixed-effects model fit by maximum likelihood
##  Data: strip.offset(mf) 
##        AIC      BIC    logLik
##   25257.96 25289.89 -12623.98
## 
## Random effects:
##  Formula: ~Xr - 1 | g
##  Structure: pdIdnot
##              Xr1      Xr2      Xr3      Xr4      Xr5      Xr6      Xr7      Xr8
## StdDev: 5.500533 5.500533 5.500533 5.500533 5.500533 5.500533 5.500533 5.500533
## 
##  Formula: ~1 | id %in% g
##         (Intercept) Residual
## StdDev:    9.567976 3.769933
## 
## Fixed effects: y ~ X - 1 
##                     Value Std.Error   DF  t-value p-value
## X(Intercept)     47.44301  0.610158 4136 77.75535       0
## Xs(predictor)Fx1 32.67583  7.724983 4136  4.22989       0
##  Correlation: 
##                  X(Int)
## Xs(predictor)Fx1 0.004 
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -7.48151438 -0.54600618  0.04704746  0.61340318  4.56413645 
## 
## Number of Observations: 4388
## Number of Groups: 
##         g id %in% g 
##         1       251
summary(smoothing_spline$gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## value ~ s(predictor, bs = "ps")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  47.4430     0.6101   77.76   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df     F p-value    
## s(predictor) 8.162  8.162 305.6  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0813   
##   Scale est. = 14.212    n = 4388
# Linear mixed model: random intercept per id, common slope over the predictor.
REM_linear = lmer(value ~ (1|id) + predictor, data)
# Population-level (fixed-effects-only) prediction line.
equ_linear = function(t) fixef(REM_linear)[1] + fixef(REM_linear)[2]*t
summary(REM_linear)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor
##    Data: data
## 
## REML criterion at convergence: 25931.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -7.2644 -0.5343  0.0868  0.6207  4.2340 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 92.68    9.627   
##  Residual             16.91    4.112   
## Number of obs: 4388, groups:  id, 251
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept) 43.837480   0.615055   71.27
## predictor    0.131058   0.003457   37.91
## 
## Correlation of Fixed Effects:
##           (Intr)
## predictor -0.076
# Quadratic mixed model (same random-intercept structure).
REM_quadratic = lmer(value ~ (1|id) + predictor + I(predictor^2), data)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
# Population-level quadratic prediction curve.
equ_quadratic = function(t) fixef(REM_quadratic)[1] + fixef(REM_quadratic)[2]*t + fixef(REM_quadratic)[3]*t^2
summary(REM_quadratic)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor + I(predictor^2)
##    Data: data
## 
## REML criterion at convergence: 25781.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -7.3328 -0.5368  0.0711  0.6172  4.1755 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 92.09    9.596   
##  Residual             16.25    4.032   
## Number of obs: 4388, groups:  id, 251
## 
## Fixed effects:
##                  Estimate Std. Error t value
## (Intercept)     4.318e+01  6.149e-01   70.23
## predictor       2.220e-01  7.748e-03   28.65
## I(predictor^2) -1.051e-03  8.056e-05  -13.05
## 
## Correlation of Fixed Effects:
##             (Intr) prdctr
## predictor   -0.106       
## I(prdctr^2)  0.081 -0.899
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
# Bounded-growth (asymptotic regression) NLME via SSasymp:
# value = yf + (y0 - yf) * exp(-exp(log_alpha) * predictor),
# with independent random effects for baseline (y0) and asymptote (yf) per id.
REM_bounded = nlmer(value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0|id) + (yf|id), data = data, start=c(yf=40, y0=20, log_alpha=-1))
# Population-level parameter estimates, used by all helper functions below.
y0=fixef(REM_bounded)["y0"]
yf=fixef(REM_bounded)["yf"]
log_alpha=fixef(REM_bounded)["log_alpha"]
# Fitted population curve; defaults evaluate at the estimated parameters but
# alternative values (e.g. CI bounds) can be passed for the ribbon.
equ_bounded = function(t, yf=fixef(REM_bounded)[["yf"]], y0=fixef(REM_bounded)[["y0"]], log_alpha=fixef(REM_bounded)[["log_alpha"]]) yf+(y0-yf)*exp(-exp(log_alpha)*t)
summary(REM_bounded)
## Warning in vcov.merMod(object, use.hessian = use.hessian): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
## Warning in vcov.merMod(object, correlation = correlation, sigm = sig): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
## Nonlinear mixed model fit by maximum likelihood  ['nlmerMod']
## Formula: value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0 | id) + (yf |  
##     id)
##    Data: data
## 
##      AIC      BIC   logLik deviance df.resid 
##  25337.0  25375.4 -12662.5  25325.0     4382 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.6692 -0.5176  0.0506  0.5950  3.7020 
## 
## Random effects:
##  Groups   Name Variance Std.Dev.
##  id       y0    96.43    9.820  
##  id.1     yf   137.63   11.732  
##  Residual       12.99    3.604  
## Number of obs: 4388, groups:  id, 251
## 
## Fixed effects:
##           Estimate Std. Error t value
## yf        53.49770    0.85777   62.37
## y0        41.01897    0.63633   64.46
## log_alpha -2.72957    0.04814  -56.70
## 
## Correlation of Fixed Effects:
##           yf     y0    
## y0        -0.010       
## log_alpha -0.362 -0.086
# Growth rate constant alpha on the natural scale.
exp(log_alpha)
##  log_alpha 
## 0.06524728
cat("Average improvement over baseline: ", (yf-y0)/y0*100)
## Average improvement over baseline:  30.42185
# Goodness of fit across the four candidate curves (same order throughout).
RMSE = sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) sqrt(mean(resid(mod)^2))) # RMSE
RMSE
## [1] 3.994790 3.915758 3.658825 3.411218
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) mean(abs(resid(mod)))) # MAE
## [1] 2.996292 2.930462 2.724674 2.565703
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) extractAIC(mod)) # edf & AIC: smoothing_spline$lme always has edf 5
##          [,1]     [,2]     [,3]     [,4]
## [1,]     4.00     5.00     5.00     6.00
## [2,] 25930.98 25766.03 25257.96 25337.03
edf = sapply(list(REM_linear, REM_quadratic, smoothing_spline$gam, REM_bounded), function(mod) { nrow(data)-df.residual(mod) }) # while smoothing_spline$gam often has much higher edf
edf
## [1] 4.000000 5.000000 9.161586 6.000000
# round() is the sprintf formatter -> character labels for the plot legend.
RMSE = round(RMSE,1)
edf = round(edf,1)

Plot

# Remaining-participant counts per 10-unit bin of the predictor; xmax is the
# first bin where fewer than 10 ids remain (all plots are truncated there).
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(predictor), .groups="keep") %>% deframe()
remaining_participants_bins = seq(0,350,by=10)
remaining_participants = data.frame(x=remaining_participants_bins, text=sapply(remaining_participants_bins, function(x) sum(participation_duration>=x)))

xmax = remaining_participants$x[which(remaining_participants$text<10)[1]]

# 5th-95th percentile of observed values: y-axis limits for the model plot.
range_90p =quantile(data$value, probs=c(0.05,0.95))

# Model-comparison plot: all trajectories (grey) overlaid with the four fitted
# curves; the RMSE/edf legend is rendered as styled HTML via geom_richtext.
p1 = ggplot() +
  geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.1) +
  geom_line(aes(x,y), data.frame(x=0:xmax, y=predict(smoothing_spline$gam, newdata=data.frame(predictor=0:xmax))), linetype="longdash", size=1) + xlim(0,xmax) + ylim(range_90p[1], range_90p[2]) +
  stat_function(fun=equ_linear, color="blue", linetype="dotted", size=1) + 
  stat_function(fun=equ_quadratic, color="green4", linetype="dashed", size=1) + 
  stat_function(fun=equ_bounded, color=linecolor, size=1) + 
  theme_pubr(base_family="Serif") + no_x + xlab(NULL) + ylab(params$unit) +
  geom_richtext(aes(x,y,label=label,hjust=1), data.frame(x=0.8*xmax, y=range_90p[2]-(range_90p[2]-range_90p[1])/1.3, label=paste0("Model RMSE (edf):<br><span style='color:#0000ff'>····· Linear: ", RMSE[1], " (", edf[1], ") </span><br><span style='color:#008b00'>- - - Quadratic: ", RMSE[2], " (", edf[2], ") </span><br><span style='color:#000000'>— — Smoothing spline: ", RMSE[3], " (", edf[3], ") </span><br><span style='color:", linecolor, "'>— Bounded growth: ", RMSE[4], " (", edf[4], ") </span>")), family="Serif")

# Remaining-participant counts strip aligned under the model plot.
p2 = ggplot(remaining_participants[remaining_participants$x<=xmax,]) +
  geom_text(aes(x=x,y="A",label=text), family="Serif") +
  theme_pubr(base_family="Serif") + no_y + xlab(params$xlab) + ylab(paste0("Remaining \n ", params$unit_n, "s")) +
  scale_x_continuous(breaks=seq(0,xmax,by=10))

(p1 / p2) + plot_layout(heights=c(0.9,0.1))
## Warning: Removed 321 row(s) containing missing values (geom_path).

# Profile-likelihood CIs for the bounded-growth fixed effects (skipped in this
# sensitivity analysis: the params flag is FALSE).
if (params$bounded.growth.confidence.interval) conf = confint.merMod(REM_bounded, c("y0","yf","log_alpha"), method="profile")
if (params$bounded.growth.confidence.interval) {
  print(conf)
  print(exp(conf[3,])) # rate constant alpha on the natural scale
  # NOTE(review): CI below combines the bounds of yf (row 1) and y0 (row 2)
  # element-wise — an approximation, not a profile CI of (yf-y0)/y0 itself.
  print(paste0("Average boundary improvement over baseline: ", round((yf-y0)/y0*100, 1), " (", round((conf[1,1]-conf[2,1])/conf[2,1]*100, 1), "-", round((conf[1,2]-conf[2,2])/conf[2,2]*100, 1), ")"))
}

Bounded growth model

# Slope of the bounded-growth curve at time t:
# d/dt [yf + (y0-yf)*exp(-alpha*t)] = alpha*(yf-y0)*exp(-alpha*t), alpha = exp(log_alpha).
equ_diff_REM_bounded = function(t) exp(log_alpha)*(yf-y0)*exp(exp(log_alpha)*-t)
# Inverse of the slope function: time at which the curve's slope equals target_slope.
equ_diff_get_time_REM_bounded = function(target_slope) log(exp(log_alpha)*(yf-y0)/target_slope)/exp(log_alpha)

# Inverse of the curve itself: time at which the fitted value reaches target_value.
equ_bounded_get_x = function(target_value) exp(-log_alpha)*log((yf-y0)/(yf-target_value))

# Annotate the curve at baseline, 50% and 90% of the total practice gain.
growth_percentiles = c(0, 0.5, 0.9)
names_percentiles = c("baseline", "half-practice point", "90% practice")
selected_timepoints = equ_bounded_get_x(y0+(yf-y0)*growth_percentiles)
example_slopes_bounded = data.frame(
  x=selected_timepoints,
  y=equ_bounded(selected_timepoints),
  # round() here is the sprintf-based formatter defined earlier in the script
  label=paste0("y=", round(equ_bounded(selected_timepoints),1), ", m=", signif(equ_diff_REM_bounded(selected_timepoints),2), " at ", params$unit_time, " ", round(selected_timepoints,0), ", ", names_percentiles),
  vjust=1.5
)
# Extra annotation row marking the asymptote (yf) near the right plot edge.
example_slopes_bounded = rbind(example_slopes_bounded, list(x=0.83*xmax, y=yf, label=paste0("boundary: ", round(yf, 1)), vjust=-1.0))

# Ribbon between the curves evaluated at the lower and upper CI bounds
# (only when the profile CI was computed; requires `conf` from above).
if (params$bounded.growth.confidence.interval) ribbon = data.frame(x=seq(0,xmax,0.05), ymin=equ_bounded(seq(0,xmax,0.05), conf["yf","2.5 %"], conf["y0","2.5 %"], conf["log_alpha","2.5 %"]), ymax=equ_bounded(seq(0,xmax,0.05), conf["yf","97.5 %"], conf["y0","97.5 %"], conf["log_alpha","97.5 %"]))

quant = quantile(table(data$id))
print(paste0("n tests = ", nrow(data), " (n ", params$unit_n, "s = ", length(unique(data$id)), ", median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 4388 (n patients = 251, median tests per patient: 11, IQR 6.5-18)"
# Spaghetti plot of individual trajectories with the fitted bounded-growth
# curve, the half-practice point (vertical dashed line) and the annotations.
p1 = ggplot() + geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.2) +
  theme_pubr(base_family="Serif") + scale_x_continuous(limits = c(0,xmax), expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + xlab(params$xlab) + ylab(params$unit) +
  geom_vline(xintercept=example_slopes_bounded[2,"x"], color=linecolor, linetype=2) +
  stat_function(fun=equ_bounded, color=linecolor, size=1) +
  geom_point(data=example_slopes_bounded[1:(nrow(example_slopes_bounded)-1),], aes(x,y), color=linecolor, size=5) + # all rows except the boundary annotation
  geom_text(data=example_slopes_bounded, aes(x,y,label=label, vjust=vjust), color=linecolor, hjust=-0.01, family="Serif")

# Confidence ribbon only when the profile CIs were computed above.
if (params$bounded.growth.confidence.interval) p1 = p1 + geom_ribbon(aes(x=x, ymin=ymin, ymax=ymax), ribbon, fill=linecolor, alpha=0.3)

p1
## Warning: Removed 110 row(s) containing missing values (geom_path).

Quantile regression

# Cut-off for the quantile regression: the user-supplied params$censor_after,
# or by default the model-derived half-practice point. Note that round() is
# the sprintf-based formatter defined earlier (returns character), hence the
# as.integer() coercion.
censor_after = if (is.null(params$censor_after)) {
  as.integer(round(selected_timepoints[2])) # half-practice point
} else {
  params$censor_after
}
# Restrict the quantile regression to the early (approximately linear) phase,
# i.e. observations up to the censoring point.
data_censored = data[data$predictor <= censor_after,]

percentiles = c(0.05,0.25,0.5,0.75,0.95)

# One linear quantile-regression fit per percentile (rq fits all taus at once).
QR = rq(value ~ predictor, tau=percentiles, data_censored)

# Kernel-based standard errors for every quantile fit. Compute the summary
# once up front -- the original recomputed summary(QR, se="ker") inside every
# iteration even though it is loop-invariant.
QR_summaries = summary(QR, se="ker")
p_vals = sapply(seq_along(QR_summaries), function(i) {
  summ = coef(QR_summaries[[i]])
  print(summ)
  # 95% Wald intervals from the kernel SEs; round() is the sprintf formatter.
  print(paste0("Intercept: ", round(summ[1,1],1), " (", round(summ[1,1]-1.96*summ[1,2],1), "-", round(summ[1,1]+1.96*summ[1,2],1), "), beta: ", round(summ[2,1],2), " (", round(summ[2,1]-1.96*summ[2,2],2), "-", round(summ[2,1]+1.96*summ[2,2],2), ")"))
  summ[2,4] # Pr(>|t|) of the slope
})
##                  Value Std. Error   t value     Pr(>|t|)
## (Intercept) 23.7452499  0.9186436 25.848163 0.000000e+00
## predictor    0.9067361  0.1455036  6.231711 5.661185e-10
## [1] "Intercept: 23.7 (21.9-25.5), beta: 0.91 (0.62-1.19)"
##                  Value Std. Error   t value     Pr(>|t|)
## (Intercept) 35.1390203 0.53341165 65.875989 0.000000e+00
## predictor    0.7002014 0.09320421  7.512551 8.837375e-14
## [1] "Intercept: 35.1 (34.1-36.2), beta: 0.70 (0.52-0.88)"
##                  Value Std. Error   t value Pr(>|t|)
## (Intercept) 41.2014647 0.49153361 83.822274        0
## predictor    0.7994884 0.09101535  8.784105        0
## [1] "Intercept: 41.2 (40.2-42.2), beta: 0.80 (0.62-0.98)"
##                 Value Std. Error   t value     Pr(>|t|)
## (Intercept) 48.238810  0.5496528 87.762330 0.000000e+00
## predictor    0.751973  0.1001570  7.507944 9.148238e-14
## [1] "Intercept: 48.2 (47.2-49.3), beta: 0.75 (0.56-0.95)"
##                  Value Std. Error   t value    Pr(>|t|)
## (Intercept) 56.2309719  0.7329476 76.718955 0.00000e+00
## predictor    0.8327975  0.1477032  5.638315 1.97312e-08
## [1] "Intercept: 56.2 (54.8-57.7), beta: 0.83 (0.54-1.12)"
# Bonferroni-adjust the slope p-values across the five quantiles.
p_vals = p.adjust(p_vals, method="bonferroni")

# Joint comparison across the quantile fits; its p-value is annotated on the plot.
ANOVA = anova(QR)

# Sample-size summary of the censored data set for the caption.
quant = quantile(table(data_censored$id))
print(paste0("n tests = ", nrow(data_censored), " (median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 1913 (median tests per patient: 7, IQR 5-11)"
# Format a p-value for display: "= <rounded>" normally, or "< 2e-16" when the
# rounded value is exactly zero (i.e. below double precision, as R reports it).
signif_p = function(x, digits=1) {
  rounded = signif(x, digits)
  if (as.character(rounded) == "0") {
    "< 2e-16"
  } else {
    paste0("= ", rounded)
  }
}

# Quantile-regression figure: individual trajectories (grey), one fitted line
# per percentile, per-line labels with adjusted p-values, overall ANOVA p, and
# the censoring point as a vertical dashed line.
ggplot() + geom_line(aes_string("predictor", "value", group="id"), data_censored, alpha=0.2, color="darkgrey") + theme_pubr(base_family="Serif") + scale_x_continuous(expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + theme(legend.position = "none") + xlab(params$xlab) + ylab(params$unit) +
  geom_abline(intercept=coef(QR)[1,], slope=coef(QR)[2,], color=linecolor) +
  geom_text(data=data.frame(intercept=coef(QR)[1,], label=paste0(percentiles*100, "th percentile: β = ", round(coef(QR)[2,],1), ", ", "p.adj ", sapply(p_vals,signif_p))),
            mapping=aes(x=1,y=intercept, label=label), color=linecolor, hjust="left", vjust=1, family="Serif") +
  coord_cartesian(xlim=c(0,censor_after)) +
  geom_text(aes(x=x,y=y,label=label), data.frame(x=0.8*censor_after, y=0, label=paste0("ANOVA p ", signif_p(ANOVA$table$pvalue, 1))), vjust=-1.5, family="Serif") +
  geom_vline(xintercept=censor_after, color=linecolor, linetype=2)

Dexterity

# Analysis parameters for the Dexterity ("Finger Pinching") section; the same
# pipeline as above is re-run with these settings.
params = list(
  test_code = "pinching",                  # Floodlight test identifier
  test_metric_code = "successful_pinches",
  unit = "Pinching: Successful Pinches",   # y-axis label
  unit_n = "hand",                         # analysis unit: each hand separately
  unit_time = "week",
  min_repetitions = 5,                     # inclusion: at least 5 tests ...
  min_weeks = 5,                           # ... over at least 5 weeks
  predictor = "weeksSinceFirst",           # time axis: calendar weeks, not repetition index
  xlab = "Weeks",
  bounded.growth.confidence.interval = F,  # skip the slow profile CIs here
  up_to_date = "2021-05-01"                # data cut-off (exclusive)
)
library(data.table) # fread
library(parsedate) # parse_date
library(dplyr) # group_by
library(tibble) # deframe
library(lme4) # lmer
library(mgcv) # gamm
library(quantreg) # rq
library(patchwork) # plot_layout
library(gridExtra) # grid.arrange
library(ggpubr) # ggscatter
library(ggtext) # geom_text
library(sjPlot) # plot_model

# Download from: https://dataset.floodlightopen.com/public-blobs-prod/complete_dataset.csv
data = fread("complete_dataset.csv", data.table=F)

# Prepare dataset: keep only this test's rows from non-control participants.
data = data[data$testCode == params$test_code & !data$participantIsControl,]
data$time = parse_date(data$testStartedAt)
data = data[data$time <= as.POSIXct(params$up_to_date, tz="UTC"),] # only analyse data up to (excluding) params$up_to_date
data = data[!duplicated(data),] # sometimes contains true duplicates for some reason (even with the same testResultMetricId)

# For "Finger Pinching" hand_used has to be determined
if (params$test_code == "pinching") {
  library(tidyr) # pivot_wider
  # just one means either "hand" or "successful_pinches" values are missing, remove those
  table(table(data$time)) # exploratory leftover: value is computed but discarded (not printed inside if)
  # NOTE(review): this filter assumes a testStartedAt timestamp is shared only
  # by the two metric rows of one test -- confirm timestamps don't collide
  # across participants.
  data = data[!data$time %in% names(which(table(data$time)==1)), ]
  # Widen so each test becomes one row with both "hand" and "successful_pinches".
  data = as.data.frame(pivot_wider(data, id_cols=c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time"), names_from="testMetricCode", values_from="testResultMetricValue"))
} else {
  # Other tests carry one metric row per test; hand is not applicable.
  data = data[data$testMetricCode == params$test_metric_code,]
  data$hand_used = NA
  data = data[c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time", "testResultMetricValue", "hand_used")]
}

# Harmonize column names across the two branches above.
colnames(data) = c("id", "control", "sex", "birthyear", "time", "value", "hand_used")
data$age = year(data$time)-data$birthyear # Estimate age
data = data[order(as.character(data$id)),]

# 0 result values are discarded
data = data[!is.na(data$value) & data$value != 0,]

# Consider those supposedly younger than 18 (minimum study age) and older than 90 as NA
data$age[data$age < 18 | data$age > 90] = NA

# For pinching each hand is its own analysis unit: id becomes participant+hand
# (for other tests hand_used is NA, so ids stay unique per participant).
data$id_original = data$id
data$id = paste0(data$id, "_hand", data$hand_used)

data$day = as.IDate(data$time)
# NOTE: deliberately shadows base::round with a fixed-decimal formatter that
# returns a *character* string (keeps trailing zeros in labels). All round()
# calls below this point use this version.
round = function(x, digits=0) sprintf(paste0("%.", digits, "f"), x)

# ggplot theme snippets that hide one axis.
no_x = theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
no_y = theme(axis.text.y=element_blank(), axis.ticks.y=element_blank())

linecolor = "#c71138"

Participant selection

# At least x weeks & repetitions
# Per-id longitudinal indices: a 0-based test counter ("repetition") and the
# weeks elapsed since the id's first test. Rows are contiguous per id after
# the sort above; assumes rows within an id are chronological -- TODO confirm.
for (id in unique(data$id)) {
  sel = data$id == id # renamed from `subset` to avoid shadowing base::subset
  n = sum(sel)
  data[sel, "repetition"] = seq_len(n) - 1
  # `units=` spelled out (the actual difftime argument) instead of relying on
  # partial argument matching via `unit=`
  data[sel, "weeksSinceFirst"] = as.numeric(difftime(data[sel, "time"], data[sel, "time"][1], units="weeks"))
}

# Cohort sizes before applying the inclusion criteria (reported further below).
n_orig = nrow(data)
n_patients_orig = length(unique(data$id_original))
n_hands_orig = length(unique(data$id))

# Per-id follow-up; rows within an id are ordered, so last() gives the totals.
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(weeksSinceFirst), repetitions=last(repetition), .groups="keep")

Among the total n=1816 hands (from 1059 patients) with n=20695 repetitions, the median length of participation is 0.6 weeks (IQR 0.0-8.1, range 0.0-151.7) and the median number of repetitions is 2 (IQR 1-7, range 1-370).

# Apply the inclusion criteria: at least min_weeks of follow-up AND at least
# min_repetitions tests (the repetition counter is 0-based, hence the +1).
data = data[data$id %in% participation_duration$id[participation_duration$weeks >= params$min_weeks & participation_duration$repetitions+1 >= params$min_repetitions],]

# Inter-test interval in days per id: each test's time minus the previous
# test's time (0 for an id's first test, since the first element is compared
# with itself).
for (id in unique(data$id)) {
  sel = data$id == id
  times = data[sel, "time"]
  # Lag by one, repeating the first element. head(times, -1) is safe when an
  # id has a single row, unlike the original times[1:(n-1)] which evaluates
  # to times[c(1, 0)] for n == 1.
  prev_times = c(times[1], head(times, -1))
  # `units=` spelled out (the actual difftime argument) instead of relying on
  # partial argument matching via `unit=`
  data[sel, "daysSinceLast"] = as.numeric(difftime(times, prev_times, units="days"))
}

# Per-id descriptive statistics after inclusion (sex, mean age, follow-up
# length, and median/IQR of the inter-test interval).
participation_duration = data %>% group_by(id) %>% summarise(sex=first(sex), mean_age=mean(age), weeks=last(weeksSinceFirst), repetitions=last(repetition), median_intertest_interval=median(daysSinceLast), IQR_intertest_interval=IQR(daysSinceLast), .groups="keep")

# Generic time axis selected by params: repetition index or weeks since first.
data$predictor = data[,params$predictor]

Inclusion criteria: participation for at least 5 weeks and at least 5 repetitions performed per test, leading to the analysis of n=252 / 1059 patients, 470 / 1816 hands and n=17945 / 20695 tests. Among those, the median length of participation is 15.6 weeks (IQR 9.9-43.4, range 5.0-151.7) and the median number of repetitions is 18 (IQR 10-42.75, range 5-370).

# Cohort description table (rendered below). round() is the sprintf-based
# formatter, so every cell is a formatted character string.
t(data.frame(
  n_patients = paste0(length(unique(data$id_original)), " / ", n_patients_orig, " (", round(length(unique(data$id_original))/n_patients_orig*100,1), "%)"),
  n_hands = paste0(length(unique(data$id)), " / ", n_hands_orig, " (", round(length(unique(data$id))/n_hands_orig*100,1), "%)"),
  n_tests = paste0(nrow(data), " / ", n_orig, " (", round(nrow(data)/n_orig*100,1), "%)"),
  percent_female = paste0(round(prop.table(table(participation_duration$sex == "female"))[[2]]*100, 1)), # [[2]] = share of TRUE; NOTE(review): assumes both sexes occur
  age = paste0(round(median(participation_duration$mean_age,na.rm=T),1), " (", round(quantile(participation_duration$mean_age, 0.25, na.rm=T),1), "-", round(quantile(participation_duration$mean_age, 0.75, na.rm=T),1), ", range ", round(min(participation_duration$mean_age, na.rm=T),1), "-", round(max(participation_duration$mean_age, na.rm=T),1), ")"),
  repetitions = paste0(median(participation_duration$repetitions)+1, " repetitions (IQR ", quantile(participation_duration$repetitions+1, 0.25), "-", quantile(participation_duration$repetitions+1, 0.75), ", range ", min(participation_duration$repetitions+1), "-", max(participation_duration$repetitions+1), ")"),
  median_intertest_interval = paste0(round(median(participation_duration$median_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$median_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$median_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$median_intertest_interval),1), "-", round(max(participation_duration$median_intertest_interval),1), ")"),
  IQR_intertest_interval = paste0(round(median(participation_duration$IQR_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$IQR_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$IQR_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$IQR_intertest_interval),1), "-", round(max(participation_duration$IQR_intertest_interval),1), ")"),
  weeks = paste0(round(median(participation_duration$weeks),1), " weeks (IQR ", round(quantile(participation_duration$weeks, 0.25),1), "-", round(quantile(participation_duration$weeks, 0.75),1), ", range ", round(min(participation_duration$weeks),1), "-", round(max(participation_duration$weeks),1), ")")
))
##                           [,1]                                        
## n_patients                "252 / 1059 (23.8%)"                        
## n_hands                   "470 / 1816 (25.9%)"                        
## n_tests                   "17945 / 20695 (86.7%)"                     
## percent_female            "71.7"                                      
## age                       "49.8 (41.7-57.5, range 20.0-79.0)"         
## repetitions               "18 repetitions (IQR 10-42.75, range 5-370)"
## median_intertest_interval "3.1 days (IQR 2.1-4.9, range 1.9-39.2)"    
## IQR_intertest_interval    "3.0 days (IQR 1.0-7.0, range 0.0-101.5)"   
## weeks                     "15.6 weeks (IQR 9.9-43.4, range 5.0-151.7)"

Summary level analysis

Difference test

# Per-id summary for the first-vs-last analysis. NOTE: here repetition = n(),
# a 1-based test count, unlike the 0-based "repetition" column in `data`.
# diff is the raw within-id improvement (last - first score).
df = as.data.frame(data %>% group_by(id) %>% summarise(first=first(value), last=last(value), mean=mean(value), weeksSinceFirst=max(weeksSinceFirst), repetition=n(), first_age=first(age), last_age=last(age), mean_age=mean(age), .groups="keep") %>% mutate(diff=last-first))

df$predictor = df[, params$predictor]

# Paired t-test: last vs first score per id.
test = t.test(df$last, df$first, paired=T)
test
## 
##  Paired t-test
## 
## data:  df$last and df$first
## t = 21.525, df = 469, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  13.41214 16.10701
## sample estimates:
## mean of the differences 
##                14.75957
# Baseline model for the first-to-last difference: age and baseline score
# only (both rescaled per 10 units so the coefficients are readable).
mod0 = lm(diff ~ I(mean_age/10) + I(first/10), df)
summ0 = summary(mod0)
summ0
## 
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10), data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.110  -9.844   0.538  10.620  32.248 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     33.9357     3.6391   9.325   <2e-16 ***
## I(mean_age/10)  -1.1000     0.6020  -1.827   0.0683 .  
## I(first/10)     -5.2868     0.5221 -10.127   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.48 on 465 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.182,  Adjusted R-squared:  0.1785 
## F-statistic: 51.73 on 2 and 465 DF,  p-value: < 2.2e-16
# Full model: adds log10 of the exposure (repetitions or weeks) on top of the
# age + baseline adjustment of mod0.
mod = lm(diff ~ I(mean_age/10) + I(first/10) + log10(predictor), df)
confint(mod)
##                      2.5 %     97.5 %
## (Intercept)      18.131290 33.2627551
## I(mean_age/10)   -2.863894 -0.5258998
## I(first/10)      -6.360843 -4.3675116
## log10(predictor)  5.565771 11.9483226
# Summary of the full model; reused below for the coefficient plot.
summ = summary(mod)
summ
## 
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10) + log10(predictor), 
##     data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.956  -8.892   0.457   9.808  31.408 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       25.6970     3.8501   6.674 7.10e-11 ***
## I(mean_age/10)    -1.6949     0.5949  -2.849  0.00458 ** 
## I(first/10)       -5.3642     0.5072 -10.576  < 2e-16 ***
## log10(predictor)   8.7570     1.6240   5.392 1.11e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.09 on 464 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.2302, Adjusted R-squared:  0.2253 
## F-statistic: 46.26 on 3 and 464 DF,  p-value: < 2.2e-16
# additional variance explained by predictor
#print(summ$r.squared - summ0$r.squared)

# Observed improvement (last vs first) relative to the mean baseline score,
# with the paired t-test confidence interval rescaled the same way.
print(paste0("Average observed improvement over baseline: ", round(test$estimate/mean(df$first)*100, 1), " (", round(test$conf[1]/mean(df$first)*100, 1), "-", round(test$conf[2]/mean(df$first)*100, 1), ")"))
## [1] "Average observed improvement over baseline: 56.6 (51.4-61.8)"
# y position for the (currently commented-out) significance bracket below.
lab.y = 1.1*mean(df$last)

# Bar chart of first/mean/last scores (mean +/- SE).
p1 = ggbarplot(data.frame(Timepoint=rep(c("First","Mean","Last"),each=nrow(df)), value=c(df$first,df$mean,df$last)), "Timepoint", "value", add="mean_se", label=T, lab.nb.digits=1, lab.vjust=1.9, ylab=params$unit) + xlab("Score") #+ stat_compare_means(comparisons = list(c("First","Last")), paired=T, method="t.test", label.y=lab.y) + scale_y_continuous(expand=expansion(mult=c(0,0.1)))

# Coefficient plot of the adjusted difference model.
# NOTE(review): plot_model() is handed the summary object (summ) rather than
# the lm object (mod) -- appears to render, but confirm this is intended.
p2 = plot_model(summ, show.values=T, vline.color = "grey", show.intercept=T, colors=linecolor, title=paste0("Difference from First to Last Score, R²=", round(summ$r.squared, 2)), axis.labels=rev(c("Intercept", "Age (per 10 years)", "First score (per 10)", paste0(params$xlab, " (log 10)"))), value.offset=0.3, show.p=F) + ylab("β estimates")

(p1 + p2) + plot_layout(widths=c(2,5)) & theme_pubr(base_family="Serif")

Confounders

# Pairwise confounder checks: scatterplots with regression line and Pearson
# correlation for every pair of {mean age, first, last, predictor, diff}
# (the predictor is shown on a log10 scale), plus marginal histograms.
p_age_first = ggscatter(df, "mean_age", "first", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("First score") + theme_pubr(base_family="Serif")

p_age_pred = ggscatter(df, "mean_age", "predictor", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("Mean age") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")

p_age_last = ggscatter(df, "mean_age", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("Last score") + theme_pubr(base_family="Serif")

p_age_diff = ggscatter(df, "mean_age", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("Difference first to last score") + theme_pubr(base_family="Serif")

p_first_pred = ggscatter(df, "first", "predictor",  add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("First score") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")

p_first_last = ggscatter(df, "first", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("First score") + ylab("Last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_first_diff = ggscatter(df, "first", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("First score") + ylab("Difference first to last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_pred_last = ggscatter(df, "predictor", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_x_log10(expand = expansion(mult = c(.05, .15))) + xlab(paste0(params$xlab, " (log10)")) + ylab("Last score") + theme_pubr(base_family="Serif")

p_pred_diff = ggscatter(df, "predictor", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_x_log10(expand = expansion(mult = c(.05, .15))) + xlab(paste0(params$xlab, " (log10)")) + ylab("Difference first to last score") + theme_pubr(base_family="Serif")

p_last_diff = ggscatter(df, "last", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Last score") + ylab("Difference first to last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_last_pred = ggscatter(df, "last", "predictor",  add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("Last score") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")


# Marginal distributions of the five variables.
p_age = gghistogram(df, "mean_age", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_first = gghistogram(df, "first", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_last = gghistogram(df, "last", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_pred = gghistogram(df, "predictor", bins=15) + scale_x_log10() + xlab(NULL) + theme_pubr(base_family="Serif")
p_diff = gghistogram(df, "diff", bins=15) + xlab("Difference first to last") + theme_pubr(base_family="Serif")

#(((p1+xlab(NULL)) + (p2+xlab(NULL)+ylab(NULL))) / ((p3+xlab(NULL)) + (p4+xlab(NULL)+ylab(NULL))) / ((p5) | (p6+ylab(NULL)))) & theme_pubr(base_family="Serif")

#(p_age_first | p_first) / (p_age_last | p_first_last | p_last) / (p_age_pred | p_first_pred | p_last_pred | p_pred_diff) / (p_age_diff | p_first_diff | p_last_diff | p_pred_diff)

# Lower-triangular 5x5 layout (a pairs-plot): histograms on the diagonal and
# each pairwise scatterplot below it. lower.tri() indexes column-wise, so the
# grobs list is ordered column by column.
m <- matrix(NA, 5, 5)
m[lower.tri(m, diag = T)] <- 1:15
grid.arrange(grobs=list(
  p_age, p_age_first+xlab(NULL), p_age_pred+xlab(NULL), p_age_last+xlab(NULL), p_age_diff,
  p_first, p_first_pred+xlab(NULL)+ylab(""), p_first_last+xlab(NULL)+ylab(""), p_first_diff+ylab(""),
  p_pred, p_pred_last+xlab(NULL)+ylab(""), p_pred_diff+ylab(""),
  p_last, p_last_diff+ylab(""), 
  p_diff
), layout_matrix=m, heights=c(1,1,1,1,1.1))
## Warning: Removed 2 rows containing non-finite values (stat_bin).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing non-finite values (stat_cor).
## Warning: Removed 2 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing non-finite values (stat_cor).
## Warning: Removed 2 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing non-finite values (stat_cor).
## Warning: Removed 2 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing non-finite values (stat_cor).
## Warning: Removed 2 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

#GGally::ggpairs(df[c("mean_age", "first", "last", "predictor", "diff")])
#pairs(df[c("mean_age", "first", "last", "predictor", "diff")], upper.panel=NULL)
#corrplot::corrplot(cor(df[c("mean_age", "first", "last", "predictor", "diff")], use="complete.obs"))

Learning curve: Model selection

# Candidate model 1 (flexible reference fit): GAMM with a penalized P-spline
# over the predictor and a random intercept per id.
smoothing_spline = gamm(value ~ s(predictor, bs="ps"), random=list(id=~1), data=data)
summary(smoothing_spline$lme)
## Linear mixed-effects model fit by maximum likelihood
##  Data: strip.offset(mf) 
##        AIC      BIC    logLik
##   126134.9 126173.9 -63062.47
## 
## Random effects:
##  Formula: ~Xr - 1 | g
##  Structure: pdIdnot
##              Xr1      Xr2      Xr3      Xr4      Xr5      Xr6      Xr7      Xr8
## StdDev: 9.195359 9.195359 9.195359 9.195359 9.195359 9.195359 9.195359 9.195359
## 
##  Formula: ~1 | id %in% g
##         (Intercept) Residual
## StdDev:    12.21739 7.700065
## 
## Fixed effects: y ~ X - 1 
##                     Value Std.Error    DF  t-value p-value
## X(Intercept)     40.74406  0.573616 17474 71.03015   0e+00
## Xs(predictor)Fx1 32.41981  9.655043 17474  3.35781   8e-04
##  Correlation: 
##                  X(Int)
## Xs(predictor)Fx1 0.006 
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -6.11799178 -0.48750847  0.07971535  0.59566834  6.73912718 
## 
## Number of Observations: 17945
## Number of Groups: 
##         g id %in% g 
##         1       470
# The gam part of the fit (smooth-term significance and effective df).
summary(smoothing_spline$gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## value ~ s(predictor, bs = "ps")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  40.7441     0.5736   71.03   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df     F p-value    
## s(predictor) 8.532  8.532 577.8  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.143   
##   Scale est. = 59.291    n = 17945
# Candidate model 2: linear mixed model with a random intercept per id.
REM_linear = lmer(value ~ (1|id) + predictor, data)
# Population-level (fixed-effect) line, used by stat_function in the plot below.
equ_linear = function(t) fixef(REM_linear)[1] + fixef(REM_linear)[2]*t
summary(REM_linear)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor
##    Data: data
## 
## REML criterion at convergence: 127724.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.2014 -0.4952  0.0919  0.6163  5.7645 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 154.28   12.421  
##  Residual              65.09    8.068  
## Number of obs: 17945, groups:  id, 470
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept) 34.922387   0.582740   59.93
## predictor    0.191399   0.003549   53.94
## 
## Correlation of Fixed Effects:
##           (Intr)
## predictor -0.073
# Candidate model 3: quadratic mixed model with a random intercept per id.
REM_quadratic = lmer(value ~ (1|id) + predictor + I(predictor^2), data)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
# Population-level (fixed-effect) parabola for plotting.
equ_quadratic = function(t) fixef(REM_quadratic)[1] + fixef(REM_quadratic)[2]*t + fixef(REM_quadratic)[3]*t^2
summary(REM_quadratic)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor + I(predictor^2)
##    Data: data
## 
## REML criterion at convergence: 127394.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.1206 -0.4867  0.0838  0.6074  6.0369 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 152.38   12.34   
##  Residual              63.84    7.99   
## Number of obs: 17945, groups:  id, 470
## 
## Fixed effects:
##                  Estimate Std. Error t value
## (Intercept)     3.412e+01  5.807e-01   58.76
## predictor       3.181e-01  7.625e-03   41.71
## I(predictor^2) -1.472e-03  7.862e-05  -18.72
## 
## Correlation of Fixed Effects:
##             (Intr) prdctr
## predictor   -0.099       
## I(prdctr^2)  0.074 -0.887
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
# Candidate model 4: nonlinear mixed bounded-growth (asymptotic regression)
# model via SSasymp: value = yf + (y0-yf)*exp(-exp(log_alpha)*predictor),
# with separate (uncorrelated) random effects per id for the baseline y0 and
# the asymptote yf.
REM_bounded = nlmer(value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0|id) + (yf|id), data = data, start=c(yf=40, y0=20, log_alpha=-1))
# Population-level fixed effects, used as globals by the helper functions below.
y0=fixef(REM_bounded)["y0"]
yf=fixef(REM_bounded)["yf"]
log_alpha=fixef(REM_bounded)["log_alpha"]
# Fitted curve; the parameters default to the fixed effects but can be
# overridden (used for the confidence ribbon).
equ_bounded = function(t, yf=fixef(REM_bounded)[["yf"]], y0=fixef(REM_bounded)[["y0"]], log_alpha=fixef(REM_bounded)[["log_alpha"]]) yf+(y0-yf)*exp(-exp(log_alpha)*t)
summary(REM_bounded)
## Nonlinear mixed model fit by maximum likelihood  ['nlmerMod']
## Formula: value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0 | id) + (yf |  
##     id)
##    Data: data
## 
##      AIC      BIC   logLik deviance df.resid 
## 123637.5 123684.2 -61812.7 123625.5    17939 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.9076 -0.4911  0.0795  0.6062  5.2560 
## 
## Random effects:
##  Groups   Name Variance Std.Dev.
##  id       y0   160.28   12.660  
##  id.1     yf   331.88   18.218  
##  Residual       48.81    6.986  
## Number of obs: 17945, groups:  id, 470
## 
## Fixed effects:
##           Estimate Std. Error t value
## yf        47.74291    0.99030   48.21
## y0        31.30313    0.60577   51.67
## log_alpha -2.71322    0.04377  -61.99
## 
## Correlation of Fixed Effects:
##           yf     y0    
## y0        -0.005       
## log_alpha -0.370 -0.110
# Growth rate on the natural scale (alpha is estimated as log_alpha).
exp(log_alpha)
##  log_alpha 
## 0.06632308
# Modelled asymptotic improvement relative to the modelled baseline, in %.
cat("Average improvement over baseline: ", (yf-y0)/y0*100)
## Average improvement over baseline:  52.51801
# Model comparison: residual RMSE/MAE and effective degrees of freedom.
RMSE = sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) sqrt(mean(resid(mod)^2))) # RMSE
RMSE
## [1] 7.964285 7.886909 7.599561 6.820973
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) mean(abs(resid(mod)))) # MAE
## [1] 5.907051 5.824308 5.597598 5.039468
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) extractAIC(mod)) # edf & AIC: smoothing_spline$lme always has edf 5
##          [,1]     [,2]     [,3]     [,4]
## [1,]      4.0      5.0      5.0      6.0
## [2,] 127723.6 127378.6 126134.9 123637.5
edf = sapply(list(REM_linear, REM_quadratic, smoothing_spline$gam, REM_bounded), function(mod) { nrow(data)-df.residual(mod) }) # while smoothing_spline$gam often has much higher edf
edf
## [1] 4.000000 5.000000 9.532433 6.000000
# Rounded for the plot annotation; round() is the sprintf formatter, so these
# become character vectors from here on.
RMSE = round(RMSE,1)
edf = round(edf,1)

Plot

# Follow-up length per id (named vector), then how many ids are still
# observed at each 10-unit grid point of the predictor.
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(predictor), .groups="keep") %>% deframe()
remaining_participants_bins = seq(0,350,by=10)
remaining_participants = data.frame(x=remaining_participants_bins, text=sapply(remaining_participants_bins, function(x) sum(participation_duration>=x)))

# Truncate the x axis at the first bin where fewer than 10 ids remain.
# NOTE(review): yields NA if >= 10 ids remain at every bin -- confirm upstream.
xmax = remaining_participants$x[which(remaining_participants$text<10)[1]]

# 5th-95th percentile of the raw values, used as y-axis limits for the plot.
range_90p =quantile(data$value, probs=c(0.05,0.95))

# Model-selection figure: individual trajectories (grey) overlaid with all
# four fitted population-level curves and an RMSE/edf legend box.
p1 = ggplot() +
  geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.1) +
  geom_line(aes(x,y), data.frame(x=0:xmax, y=predict(smoothing_spline$gam, newdata=data.frame(predictor=0:xmax))), linetype="longdash", size=1) + xlim(0,xmax) + ylim(range_90p[1], range_90p[2]) +
  stat_function(fun=equ_linear, color="blue", linetype="dotted", size=1) + 
  stat_function(fun=equ_quadratic, color="green4", linetype="dashed", size=1) + 
  stat_function(fun=equ_bounded, color=linecolor, size=1) + 
  theme_pubr(base_family="Serif") + no_x + xlab(NULL) + ylab(params$unit) +
  geom_richtext(aes(x,y,label=label,hjust=1), data.frame(x=0.8*xmax, y=range_90p[2]-(range_90p[2]-range_90p[1])/1.3, label=paste0("Model RMSE (edf):<br><span style='color:#0000ff'>····· Linear: ", RMSE[1], " (", edf[1], ") </span><br><span style='color:#008b00'>- - - Quadratic: ", RMSE[2], " (", edf[2], ") </span><br><span style='color:#000000'>— — Smoothing spline: ", RMSE[3], " (", edf[3], ") </span><br><span style='color:", linecolor, "'>— Bounded growth: ", RMSE[4], " (", edf[4], ") </span>")), family="Serif")

# Counts strip beneath the panel: remaining ids at each 10-unit grid point.
p2 = ggplot(remaining_participants[remaining_participants$x<=xmax,]) +
  geom_text(aes(x=x,y="A",label=text), family="Serif") +
  theme_pubr(base_family="Serif") + no_y + xlab(params$xlab) + ylab(paste0("Remaining \n ", params$unit_n, "s")) +
  scale_x_continuous(breaks=seq(0,xmax,by=10))

# Stack: 90% curve panel over 10% counts strip.
(p1 / p2) + plot_layout(heights=c(0.9,0.1))
## Warning: Removed 907 row(s) containing missing values (geom_path).

# Profile confidence intervals for the bounded-growth fixed effects (skipped
# in this section: the params flag is FALSE).
if (params$bounded.growth.confidence.interval) conf = confint.merMod(REM_bounded, c("y0","yf","log_alpha"), method="profile")
if (params$bounded.growth.confidence.interval) {
  print(conf)
  # alpha is profiled on the log scale; report the rate itself. Index rows by
  # name (as the ribbon code below already does) rather than by position --
  # the row order of the confint result must not be assumed.
  print(exp(conf["log_alpha",]))
  # Approximate CI for the relative improvement (yf-y0)/y0: pairs the lower
  # bounds together and the upper bounds together (ignores the yf/y0
  # correlation, consistent with the point estimate).
  print(paste0("Average boundary improvement over baseline: ", round((yf-y0)/y0*100, 1), " (", round((conf["yf",1]-conf["y0",1])/conf["y0",1]*100, 1), "-", round((conf["yf",2]-conf["y0",2])/conf["y0",2]*100, 1), ")"))
}

Bounded growth model

# First derivative of the fitted curve y(t) = yf + (y0-yf)*exp(-exp(log_alpha)*t):
# the instantaneous improvement per unit time at time t. Uses the global fixed
# effects y0, yf, log_alpha extracted from REM_bounded above.
equ_diff_REM_bounded = function(t) exp(log_alpha)*(yf-y0)*exp(exp(log_alpha)*-t)
# Inverse of the derivative: time at which the curve's slope equals target_slope.
equ_diff_get_time_REM_bounded = function(target_slope) log(exp(log_alpha)*(yf-y0)/target_slope)/exp(log_alpha)

# Inverse of the curve itself: time at which the fitted value reaches target_value.
equ_bounded_get_x = function(target_value) exp(-log_alpha)*log((yf-y0)/(yf-target_value))

# Annotate the curve at 0%, 50% and 90% of the total modelled gain (yf - y0).
growth_percentiles = c(0, 0.5, 0.9)
names_percentiles = c("baseline", "half-practice point", "90% practice")
selected_timepoints = equ_bounded_get_x(y0+(yf-y0)*growth_percentiles)
example_slopes_bounded = data.frame(
  x=selected_timepoints,
  y=equ_bounded(selected_timepoints),
  label=paste0("y=", round(equ_bounded(selected_timepoints),1), ", m=", signif(equ_diff_REM_bounded(selected_timepoints),2), " at ", params$unit_time, " ", round(selected_timepoints,0), ", ", names_percentiles),
  vjust=1.5
)
# Extra annotation row marking the asymptote ("boundary" yf), printed above the line.
example_slopes_bounded = rbind(example_slopes_bounded, list(x=0.83*xmax, y=yf, label=paste0("boundary: ", round(yf, 1)), vjust=-1.0))

# Pointwise band from the profile bounds (only when the CI was computed above).
if (params$bounded.growth.confidence.interval) ribbon = data.frame(x=seq(0,xmax,0.05), ymin=equ_bounded(seq(0,xmax,0.05), conf["yf","2.5 %"], conf["y0","2.5 %"], conf["log_alpha","2.5 %"]), ymax=equ_bounded(seq(0,xmax,0.05), conf["yf","97.5 %"], conf["y0","97.5 %"], conf["log_alpha","97.5 %"]))

# Sample-size summary for the figure caption.
quant = quantile(table(data$id))
print(paste0("n tests = ", nrow(data), " (n ", params$unit_n, "s = ", length(unique(data$id)), ", median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 17945 (n hands = 470, median tests per hand: 18, IQR 10-42.75)"
p1 = ggplot() + geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.2) +
  theme_pubr(base_family="Serif") + scale_x_continuous(limits = c(0,xmax), expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + xlab(params$xlab) + ylab(params$unit) +
  geom_vline(xintercept=example_slopes_bounded[2,"x"], color=linecolor, linetype=2) +
  stat_function(fun=equ_bounded, color=linecolor, size=1) +
  geom_point(data=example_slopes_bounded[1:(nrow(example_slopes_bounded)-1),], aes(x,y), color=linecolor, size=5) +
  geom_text(data=example_slopes_bounded, aes(x,y,label=label, vjust=vjust), color=linecolor, hjust=-0.01, family="Serif")

if (params$bounded.growth.confidence.interval) p1 = p1 + geom_ribbon(aes(x=x, ymin=ymin, ymax=ymax), ribbon, fill=linecolor, alpha=0.3)

p1
## Warning: Removed 423 row(s) containing missing values (geom_path).

Quantile regression

# Censor at the half-practice point (or a fixed params$censor_after) so the
# quantile regressions are fitted to the near-linear early segment only.
# NOTE(review): round() was redefined earlier to return a character string;
# as.integer() parses it back to an integer here.
if (is.null(params$censor_after)) {
  censor_after = as.integer(round(selected_timepoints[2])) # half-practice point
} else {
  censor_after = params$censor_after
}
data_censored = data[data$predictor <= censor_after,]

# Quantiles of the score distribution to model.
percentiles = c(0.05,0.25,0.5,0.75,0.95)

# One linear quantile regression per tau, fitted jointly.
QR = rq(value ~ predictor, tau=percentiles, data_censored)

# Per-quantile coefficient summaries with kernel-estimated standard errors.
# The expensive summary() call is hoisted out of the loop: the original
# recomputed the full list of per-tau summaries on every iteration (and used
# 1:length(...)). Printed output and the returned vector of slope p-values
# are unchanged.
QR_summaries = summary(QR, se="ker")
p_vals = sapply(seq_along(QR_summaries), function(i) {
  summ = coef(QR_summaries[[i]])
  print(summ)
  # Report intercept and slope with normal-approximation 95% CIs.
  print(paste0("Intercept: ", round(summ[1,1],1), " (", round(summ[1,1]-1.96*summ[1,2],1), "-", round(summ[1,1]+1.96*summ[1,2],1), "), beta: ", round(summ[2,1],2), " (", round(summ[2,1]-1.96*summ[2,2],2), "-", round(summ[2,1]+1.96*summ[2,2],2), ")"))
  summ[2,4]  # Pr(>|t|) of the predictor slope
})
##                 Value Std. Error   t value Pr(>|t|)
## (Intercept) 8.0000000  0.5299614 15.095438        0
## predictor   0.8618758  0.1018772  8.459951        0
## [1] "Intercept: 8.0 (7.0-9.0), beta: 0.86 (0.66-1.06)"
##                 Value Std. Error  t value Pr(>|t|)
## (Intercept) 21.000000 0.37657183 55.76625        0
## predictor    1.486519 0.08530197 17.42655        0
## [1] "Intercept: 21.0 (20.3-21.7), beta: 1.49 (1.32-1.65)"
##                 Value Std. Error  t value Pr(>|t|)
## (Intercept) 30.961627 0.39778612 77.83486        0
## predictor    1.655894 0.07527373 21.99830        0
## [1] "Intercept: 31.0 (30.2-31.7), beta: 1.66 (1.51-1.80)"
##                 Value Std. Error   t value Pr(>|t|)
## (Intercept) 42.281015 0.40426916 104.58630        0
## predictor    1.294093 0.07355677  17.59311        0
## [1] "Intercept: 42.3 (41.5-43.1), beta: 1.29 (1.15-1.44)"
##                  Value Std. Error   t value Pr(>|t|)
## (Intercept) 54.1962485 0.46417477 116.75828        0
## predictor    0.9677458 0.08751407  11.05817        0
## [1] "Intercept: 54.2 (53.3-55.1), beta: 0.97 (0.80-1.14)"
# Bonferroni-adjust the slope p-values across the five quantiles.
p_vals = p.adjust(p_vals, method="bonferroni")

# Joint ANOVA across the quantile fits (p-value used in the plot annotation).
ANOVA = anova(QR)

# Cohort size for the censored dataset.
quant = quantile(table(data_censored$id))
print(paste0("n tests = ", nrow(data_censored), " (median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 7494 (median tests per hand: 12, IQR 7-24)"
# Format a p-value for display: round to `digits` significant figures.
# Values that round/underflow to exactly 0 are shown as "< 2e-16" (the
# smallest p-value R reports), otherwise as "= <value>".
# The original compared as.character(x) == "0"; a numeric comparison is
# equivalent here and clearer.
signif_p = function(x, digits=1) {
  x = signif(x, digits)
  if (x == 0) "< 2e-16" else paste0("= ", x)
}

# Spaghetti plot of the censored trajectories with one regression line per
# quantile, each labelled at its intercept with the Bonferroni-adjusted
# slope p-value; the dashed line marks the censoring point.
ggplot() + geom_line(aes_string("predictor", "value", group="id"), data_censored, alpha=0.2, color="darkgrey") + theme_pubr(base_family="Serif") + scale_x_continuous(expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + theme(legend.position = "none") + xlab(params$xlab) + ylab(params$unit) +
  geom_abline(intercept=coef(QR)[1,], slope=coef(QR)[2,], color=linecolor) +
  geom_text(data=data.frame(intercept=coef(QR)[1,], label=paste0(percentiles*100, "th percentile: β = ", round(coef(QR)[2,],1), ", ", "p.adj ", sapply(p_vals,signif_p))),
            mapping=aes(x=1,y=intercept, label=label), color=linecolor, hjust="left", vjust=1, family="Serif") +
  coord_cartesian(xlim=c(0,censor_after)) +
  geom_text(aes(x=x,y=y,label=label), data.frame(x=0.8*censor_after, y=0, label=paste0("ANOVA p ", signif_p(ANOVA$table$pvalue, 1))), vjust=-1.5, family="Serif") +
  geom_vline(xintercept=censor_after, color=linecolor, linetype=2)

Mobility

# Analysis parameters for the Mobility section (Two Minute Walk test).
# TRUE/FALSE spelled out instead of the reassignable T/F shortcuts.
params = list(
  test_code = "two_min_walk",
  test_metric_code = "steps",
  unit = "Two Minute Walk: Steps",
  unit_n = "patient",            # unit of analysis: one id per patient
  unit_time = "week",
  min_repetitions = 5,           # inclusion: at least 5 repetitions
  min_weeks = 5,                 # inclusion: at least 5 weeks of participation
  predictor = "weeksSinceFirst", # model practice over calendar time, not count
  xlab = "Weeks",
  bounded.growth.confidence.interval = FALSE, # skip profile CIs in this section
  censor_after = 11, # allow comparison with cognition
  up_to_date = "2021-05-01"
)
library(data.table) # fread
library(parsedate) # parse_date
library(dplyr) # group_by
library(tibble) # deframe
library(lme4) # lmer
library(mgcv) # gamm
library(quantreg) # rq
library(patchwork) # plot_layout
library(gridExtra) # grid.arrange
library(ggpubr) # ggscatter
library(ggtext) # geom_text
library(sjPlot) # plot_model

# Download from: https://dataset.floodlightopen.com/public-blobs-prod/complete_dataset.csv
data = fread("complete_dataset.csv", data.table=F)

# Prepare dataset: keep only the selected test and non-control participants.
data = data[data$testCode == params$test_code & !data$participantIsControl,]
data$time = parse_date(data$testStartedAt)
data = data[data$time <= as.POSIXct(params$up_to_date, tz="UTC"),] # only analyse data up to and including midnight (UTC) of params$up_to_date (<= includes the cutoff instant)
data = data[!duplicated(data),] # sometimes contains true duplicates for some reason (even with the same testResultMetricId)

# For "Finger Pinching" hand_used has to be determined: the dataset stores the
# hand and the pinch count as separate metric rows per test, so they are
# pivoted into one row per test.
if (params$test_code == "pinching") {
  library(tidyr) # pivot_wider
  # just one means either "hand" or "successful_pinches" values are missing, remove those
  table(table(data$time))
  data = data[!data$time %in% names(which(table(data$time)==1)), ]
  data = as.data.frame(pivot_wider(data, id_cols=c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time"), names_from="testMetricCode", values_from="testResultMetricValue"))
} else {
  # All other tests: keep the single requested metric; hand_used not applicable.
  data = data[data$testMetricCode == params$test_metric_code,]
  data$hand_used = NA
  data = data[c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time", "testResultMetricValue", "hand_used")]
}

# Harmonise column names across both branches.
colnames(data) = c("id", "control", "sex", "birthyear", "time", "value", "hand_used")
data$age = year(data$time)-data$birthyear # Estimate age (test year minus birth year, accurate to within one year)
data = data[order(as.character(data$id)),]

# 0 result values are discarded
data = data[!is.na(data$value) & data$value != 0,]

# Consider those supposedly younger than 18 (minimum study age) and older than 90 as NA
data$age[data$age < 18 | data$age > 90] = NA

# Hand-specific tests get one analysis unit per hand; otherwise hand_used is
# NA, so every id becomes "<id>_handNA" (one unit per patient).
data$id_original = data$id
data$id = paste0(data$id, "_hand", data$hand_used)

data$day = as.IDate(data$time)
# NOTE(review): this shadows base::round with a fixed-decimal formatter that
# RETURNS A CHARACTER STRING (via sprintf). Every later round() call in this
# document therefore produces display text, not a number.
round = function(x, digits=0) sprintf(paste0("%.", digits, "f"), x)

# ggplot helpers: themes that blank one axis, and the accent colour.
no_x = theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
no_y = theme(axis.text.y=element_blank(), axis.ticks.y=element_blank())

linecolor = "#c71138"

Participant selection

# Per participant (in row order): 0-based repetition index and weeks elapsed
# since that participant's first test. Used below for the inclusion criteria
# (at least params$min_weeks weeks and params$min_repetitions repetitions).
for (id in unique(data$id)) {
  subset = data$id == id
  n = sum(subset)
  # seq_len(n) instead of 1:n (safe by construction; n >= 1 for every id).
  data[subset, "repetition"] = seq_len(n)-1
  # `units=` spelled out: difftime's argument is `units`; the original relied
  # on partial matching of `unit=`.
  data[subset, "weeksSinceFirst"] = as.numeric(difftime(data[subset, "time"], data[subset, "time"][1], units="weeks"))
}

# Pre-filter cohort sizes, kept for the inclusion-rate table below.
n_orig = nrow(data)
n_patients_orig = length(unique(data$id_original))
n_hands_orig = length(unique(data$id))

# Per-participant totals; rows are ordered within id, so last() is the final value.
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(weeksSinceFirst), repetitions=last(repetition), .groups="keep")

Among the total n=540 patients with n=14031 repetitions, the median length of participation is 0.9 weeks (IQR 0.0-11.2, range 0.0-133.5) and the median number of repetitions is 3 (IQR 1-14.25, range 1-735).

# Apply the inclusion criteria (repetition is 0-based, hence the +1).
data = data[data$id %in% participation_duration$id[participation_duration$weeks >= params$min_weeks & participation_duration$repetitions+1 >= params$min_repetitions],]

# Days between consecutive tests per participant; the first test gets 0
# (difference with itself).
for (id in unique(data$id)) {
  subset = data$id == id
  n = sum(subset)
  # head(x, -1) drops the last element. Unlike the original 1:(n-1) indexing,
  # it is also correct for a single-row participant (n == 1, where 1:(n-1)
  # expands to c(1, 0) and yields a length-2 lag vector). Latent here because
  # inclusion requires >= 5 repetitions, but fixed for safety. `units=` is
  # spelled out instead of relying on partial matching of `unit=`.
  data[subset, "daysSinceLast"] = as.numeric(difftime(data[subset, "time"], c(data[subset, "time"][1], head(data[subset, "time"], -1)), units="days"))
}

# Post-filter per-participant summary used for the cohort table.
# mean(age) propagates NA for participants whose age was set NA above;
# downstream summaries use na.rm=T.
participation_duration = data %>% group_by(id) %>% summarise(sex=first(sex), mean_age=mean(age), weeks=last(weeksSinceFirst), repetitions=last(repetition), median_intertest_interval=median(daysSinceLast), IQR_intertest_interval=IQR(daysSinceLast), .groups="keep")

# Generic predictor column (repetition or weeksSinceFirst, per params).
data$predictor = data[,params$predictor]

Inclusion criteria: participation for at least 5 weeks and at least 5 repetitions performed per test, leading to the analysis of n=161 / 540 patients and n=12997 / 14031 tests. Among those, the median length of participation is 16.4 weeks (IQR 10.8-46.4, range 5.3-133.5) and the median number of repetitions is 37 (IQR 15-84, range 5-735).

# Cohort description table: inclusion rates and distributional summaries.
# Note: round() here is the character-returning formatter defined above.
t(data.frame(
  n_patients = paste0(length(unique(data$id_original)), " / ", n_patients_orig, " (", round(length(unique(data$id_original))/n_patients_orig*100,1), "%)"),
  n_hands = paste0(length(unique(data$id)), " / ", n_hands_orig, " (", round(length(unique(data$id))/n_hands_orig*100,1), "%)"),
  n_tests = paste0(nrow(data), " / ", n_orig, " (", round(nrow(data)/n_orig*100,1), "%)"),
  percent_female = paste0(round(prop.table(table(participation_duration$sex == "female"))[[2]]*100, 1)),
  age = paste0(round(median(participation_duration$mean_age,na.rm=T),1), " (", round(quantile(participation_duration$mean_age, 0.25, na.rm=T),1), "-", round(quantile(participation_duration$mean_age, 0.75, na.rm=T),1), ", range ", round(min(participation_duration$mean_age, na.rm=T),1), "-", round(max(participation_duration$mean_age, na.rm=T),1), ")"),
  repetitions = paste0(median(participation_duration$repetitions)+1, " repetitions (IQR ", quantile(participation_duration$repetitions+1, 0.25), "-", quantile(participation_duration$repetitions+1, 0.75), ", range ", min(participation_duration$repetitions+1), "-", max(participation_duration$repetitions+1), ")"),
  median_intertest_interval = paste0(round(median(participation_duration$median_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$median_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$median_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$median_intertest_interval),1), "-", round(max(participation_duration$median_intertest_interval),1), ")"),
  IQR_intertest_interval = paste0(round(median(participation_duration$IQR_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$IQR_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$IQR_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$IQR_intertest_interval),1), "-", round(max(participation_duration$IQR_intertest_interval),1), ")"),
  weeks = paste0(round(median(participation_duration$weeks),1), " weeks (IQR ", round(quantile(participation_duration$weeks, 0.25),1), "-", round(quantile(participation_duration$weeks, 0.75),1), ", range ", round(min(participation_duration$weeks),1), "-", round(max(participation_duration$weeks),1), ")")
))
##                           [,1]                                         
## n_patients                "161 / 540 (29.8%)"                          
## n_hands                   "161 / 540 (29.8%)"                          
## n_tests                   "12997 / 14031 (92.6%)"                      
## percent_female            "72.0"                                       
## age                       "50.0 (41.7-58.0, range 20.0-74.3)"          
## repetitions               "37 repetitions (IQR 15-84, range 5-735)"    
## median_intertest_interval "1.3 days (IQR 1.0-2.9, range 1.0-24.9)"     
## IQR_intertest_interval    "1.9 days (IQR 0.7-4.6, range 0.1-39.2)"     
## weeks                     "16.4 weeks (IQR 10.8-46.4, range 5.3-133.5)"

Summary level analysis

Difference test

# Per-participant first/mean/last scores plus covariates; diff = last - first.
df = as.data.frame(data %>% group_by(id) %>% summarise(first=first(value), last=last(value), mean=mean(value), weeksSinceFirst=max(weeksSinceFirst), repetition=n(), first_age=first(age), last_age=last(age), mean_age=mean(age), .groups="keep") %>% mutate(diff=last-first))

# NOTE(review): repetition here is n() (a count), while the per-row
# "repetition" column is 0-based — the two differ by one; confirm intended.
df$predictor = df[, params$predictor]

# Paired t-test of last vs first score per participant.
test = t.test(df$last, df$first, paired=T)
test
## 
##  Paired t-test
## 
## data:  df$last and df$first
## t = 0.61336, df = 160, p-value = 0.5405
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -4.922237  9.357020
## sample estimates:
## mean of the differences 
##                2.217391
# Baseline model: score change explained by age and first score
# (regression-to-the-mean check) without the practice predictor.
mod0 = lm(diff ~ I(mean_age/10) + I(first/10), df)
summ0 = summary(mod0)
summ0
## 
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -133.799  -19.684    6.243   23.605  141.065 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     53.5479    25.3623   2.111 0.036329 *  
## I(mean_age/10)   0.5238     3.3466   0.157 0.875817    
## I(first/10)     -2.6593     0.7265  -3.660 0.000344 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.24 on 157 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.08709,    Adjusted R-squared:  0.07546 
## F-statistic: 7.488 on 2 and 157 DF,  p-value: 0.0007829
# Full model: add log10(predictor) (amount of practice) to the baseline model.
mod = lm(diff ~ I(mean_age/10) + I(first/10) + log10(predictor), df)
confint(mod)
##                      2.5 %    97.5 %
## (Intercept)      -6.870781 95.059151
## I(mean_age/10)   -7.880426  5.756801
## I(first/10)      -4.319600 -1.425222
## log10(predictor) -2.471937 35.257452
summ = summary(mod)
summ
## 
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10) + log10(predictor), 
##     data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -135.255  -21.276    4.785   24.222  132.313 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       44.0942    25.8013   1.709 0.089441 .  
## I(mean_age/10)    -1.0618     3.4520  -0.308 0.758799    
## I(first/10)       -2.8724     0.7326  -3.921 0.000132 ***
## log10(predictor)  16.3928     9.5504   1.716 0.088064 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 43.97 on 156 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.104,  Adjusted R-squared:  0.08678 
## F-statistic: 6.036 on 3 and 156 DF,  p-value: 0.0006469
# additional variance explained by predictor
#print(summ$r.squared - summ0$r.squared)

# Observed improvement as a percentage of mean baseline, with the paired
# t-test confidence interval.
print(paste0("Average observed improvement over baseline: ", round(test$estimate/mean(df$first)*100, 1), " (", round(test$conf[1]/mean(df$first)*100, 1), "-", round(test$conf[2]/mean(df$first)*100, 1), ")"))
## [1] "Average observed improvement over baseline: 1.1 (-2.4-4.6)"
lab.y = 1.1*mean(df$last)

# Bar chart of first/mean/last scores (mean +/- SE).
p1 = ggbarplot(data.frame(Timepoint=rep(c("First","Mean","Last"),each=nrow(df)), value=c(df$first,df$mean,df$last)), "Timepoint", "value", add="mean_se", label=T, lab.nb.digits=1, lab.vjust=1.9, ylab=params$unit) + xlab("Score") #+ stat_compare_means(comparisons = list(c("First","Last")), paired=T, method="t.test", label.y=lab.y) + scale_y_continuous(expand=expansion(mult=c(0,0.1)))

# Coefficient plot of the full model.
# NOTE(review): plot_model() is passed the summary object (summ), not the lm
# object (mod); sjPlot's documented input is the fitted model — confirm.
p2 = plot_model(summ, show.values=T, vline.color = "grey", show.intercept=T, colors=linecolor, title=paste0("Difference from First to Last Score, R²=", round(summ$r.squared, 2)), axis.labels=rev(c("Intercept", "Age (per 10 years)", "First score (per 10)", paste0(params$xlab, " (log 10)"))), value.offset=0.3, show.p=F) + ylab("β estimates")

(p1 + p2) + plot_layout(widths=c(2,5)) & theme_pubr(base_family="Serif")

Confounders

# Pairwise scatter plots (with regression line and correlation coefficient)
# among mean age, first score, predictor, last score and diff; predictor is
# shown on a log10 axis throughout.
p_age_first = ggscatter(df, "mean_age", "first", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("First score") + theme_pubr(base_family="Serif")

p_age_pred = ggscatter(df, "mean_age", "predictor", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("Mean age") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")

p_age_last = ggscatter(df, "mean_age", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("Last score") + theme_pubr(base_family="Serif")

p_age_diff = ggscatter(df, "mean_age", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("Difference first to last score") + theme_pubr(base_family="Serif")

p_first_pred = ggscatter(df, "first", "predictor",  add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("First score") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")

p_first_last = ggscatter(df, "first", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("First score") + ylab("Last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_first_diff = ggscatter(df, "first", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("First score") + ylab("Difference first to last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_pred_last = ggscatter(df, "predictor", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_x_log10(expand = expansion(mult = c(.05, .15))) + xlab(paste0(params$xlab, " (log10)")) + ylab("Last score") + theme_pubr(base_family="Serif")

p_pred_diff = ggscatter(df, "predictor", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_x_log10(expand = expansion(mult = c(.05, .15))) + xlab(paste0(params$xlab, " (log10)")) + ylab("Difference first to last score") + theme_pubr(base_family="Serif")

p_last_diff = ggscatter(df, "last", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Last score") + ylab("Difference first to last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_last_pred = ggscatter(df, "last", "predictor",  add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("Last score") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")


# Marginal histograms for the diagonal of the matrix below.
p_age = gghistogram(df, "mean_age", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_first = gghistogram(df, "first", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_last = gghistogram(df, "last", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_pred = gghistogram(df, "predictor", bins=15) + scale_x_log10() + xlab(NULL) + theme_pubr(base_family="Serif")
p_diff = gghistogram(df, "diff", bins=15) + xlab("Difference first to last") + theme_pubr(base_family="Serif")

#(((p1+xlab(NULL)) + (p2+xlab(NULL)+ylab(NULL))) / ((p3+xlab(NULL)) + (p4+xlab(NULL)+ylab(NULL))) / ((p5) | (p6+ylab(NULL)))) & theme_pubr(base_family="Serif")

#(p_age_first | p_first) / (p_age_last | p_first_last | p_last) / (p_age_pred | p_first_pred | p_last_pred | p_pred) / (p_age_diff | p_first_diff | p_last_diff | p_pred_diff)

# Lower-triangular 5x5 layout (histograms on the diagonal), filled
# column-major by lower.tri with positions 1..15.
m <- matrix(NA, 5, 5)
m[lower.tri(m, diag = T)] <- 1:15
grid.arrange(grobs=list(
  p_age, p_age_first+xlab(NULL), p_age_pred+xlab(NULL), p_age_last+xlab(NULL), p_age_diff,
  p_first, p_first_pred+xlab(NULL)+ylab(""), p_first_last+xlab(NULL)+ylab(""), p_first_diff+ylab(""),
  p_pred, p_pred_last+xlab(NULL)+ylab(""), p_pred_diff+ylab(""),
  p_last, p_last_diff+ylab(""), 
  p_diff
), layout_matrix=m, heights=c(1,1,1,1,1.1))
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

#GGally::ggpairs(df[c("mean_age", "first", "last", "predictor", "diff")])
#pairs(df[c("mean_age", "first", "last", "predictor", "diff")], upper.panel=NULL)
#corrplot::corrplot(cor(df[c("mean_age", "first", "last", "predictor", "diff")], use="complete.obs"))

Learning curve: Model selection

# GAMM: penalised-spline smooth of score over the predictor with a random
# intercept per participant (mgcv::gamm; the lme part carries the random effect).
smoothing_spline = gamm(value ~ s(predictor, bs="ps"), random=list(id=~1), data=data)
summary(smoothing_spline$lme)
## Linear mixed-effects model fit by maximum likelihood
##  Data: strip.offset(mf) 
##        AIC      BIC    logLik
##   122130.2 122167.6 -61060.11
## 
## Random effects:
##  Formula: ~Xr - 1 | g
##  Structure: pdIdnot
##               Xr1       Xr2       Xr3       Xr4       Xr5       Xr6       Xr7
## StdDev: 0.8366092 0.8366092 0.8366092 0.8366092 0.8366092 0.8366092 0.8366092
##               Xr8
## StdDev: 0.8366092
## 
##  Formula: ~1 | id %in% g
##         (Intercept) Residual
## StdDev:    46.94198 25.74938
## 
## Fixed effects: y ~ X - 1 
##                     Value Std.Error    DF  t-value p-value
## X(Intercept)     204.9623  3.729856 12835 54.95180  0.0000
## Xs(predictor)Fx1 -14.8224  5.402353 12835 -2.74369  0.0061
##  Correlation: 
##                  X(Int)
## Xs(predictor)Fx1 0.018 
## 
## Standardized Within-Group Residuals:
##          Min           Q1          Med           Q3          Max 
## -11.29909193  -0.27808020   0.09336959   0.43268993   6.65718040 
## 
## Number of Observations: 12997
## Number of Groups: 
##         g id %in% g 
##         1       161
summary(smoothing_spline$gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## value ~ s(predictor, bs = "ps")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   204.96       3.73   54.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##               edf Ref.df     F  p-value    
## s(predictor) 4.12   4.12 5.468 0.000188 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  -0.00125   
##   Scale est. = 663.03    n = 12997
# Linear mixed model: random intercept per participant, fixed linear slope.
REM_linear = lmer(value ~ (1|id) + predictor, data)
# Population-level (fixed-effects) prediction line for plotting.
equ_linear = function(t) fixef(REM_linear)[1] + fixef(REM_linear)[2]*t
summary(REM_linear)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor
##    Data: data
## 
## REML criterion at convergence: 122138.3
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -11.2946  -0.2765   0.0910   0.4311   6.6409 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 2217.0   47.08   
##  Residual              664.4   25.78   
## Number of obs: 12997, groups:  id, 161
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept) 205.026547   3.738503  54.842
## predictor    -0.004716   0.012205  -0.386
## 
## Correlation of Fixed Effects:
##           (Intr)
## predictor -0.043
# Quadratic mixed model (same random-intercept structure).
REM_quadratic = lmer(value ~ (1|id) + predictor + I(predictor^2), data)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
# Population-level quadratic prediction curve for plotting.
equ_quadratic = function(t) fixef(REM_quadratic)[1] + fixef(REM_quadratic)[2]*t + fixef(REM_quadratic)[3]*t^2
summary(REM_quadratic)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor + I(predictor^2)
##    Data: data
## 
## REML criterion at convergence: 122141.5
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -11.3042  -0.2803   0.0921   0.4309   6.6902 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 2212.8   47.04   
##  Residual              663.9   25.77   
## Number of obs: 12997, groups:  id, 161
## 
## Fixed effects:
##                  Estimate Std. Error t value
## (Intercept)     2.044e+02  3.740e+00  54.638
## predictor       8.473e-02  2.940e-02   2.882
## I(predictor^2) -9.937e-04  2.972e-04  -3.344
## 
## Correlation of Fixed Effects:
##             (Intr) prdctr
## predictor   -0.066       
## I(prdctr^2)  0.053 -0.910
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
# Bounded-growth (asymptotic regression) model via SSasymp:
# value = yf + (y0-yf)*exp(-exp(log_alpha)*t), with independent random effects
# for baseline (y0) and asymptote (yf) per participant.
# NOTE(review): per the warning below, the optimiser did not converge within
# 10000 evaluations — estimates for this section may be unreliable.
REM_bounded = nlmer(value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0|id) + (yf|id), data = data, start=c(yf=40, y0=20, log_alpha=-1))
## Warning in (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, :
## failure to converge in 10000 evaluations
# Fixed effects reused by the helper equations further down.
y0=fixef(REM_bounded)["y0"]
yf=fixef(REM_bounded)["yf"]
log_alpha=fixef(REM_bounded)["log_alpha"]
# Fitted curve; parameters default to the fixed effects but can be overridden
# (used for the CI ribbon bounds).
equ_bounded = function(t, yf=fixef(REM_bounded)[["yf"]], y0=fixef(REM_bounded)[["y0"]], log_alpha=fixef(REM_bounded)[["log_alpha"]]) yf+(y0-yf)*exp(-exp(log_alpha)*t)
summary(REM_bounded)
## Nonlinear mixed model fit by maximum likelihood  ['nlmerMod']
## Formula: value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0 | id) + (yf |  
##     id)
##    Data: data
## 
##      AIC      BIC   logLik deviance df.resid 
## 120909.6 120954.4 -60448.8 120897.6    12991 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -12.1351  -0.2918   0.0746   0.4097   5.2339 
## 
## Random effects:
##  Groups   Name Variance Std.Dev.
##  id       y0   2482.1   49.82   
##  id.1     yf   2896.5   53.82   
##  Residual       583.5   24.15   
## Number of obs: 12997, groups:  id, 161
## 
## Fixed effects:
##            Estimate Std. Error t value
## yf        205.00202    0.06989 2933.30
## y0        205.37997    0.05922 3468.28
## log_alpha  -1.89334    0.08054  -23.51
## 
## Correlation of Fixed Effects:
##           yf     y0    
## y0        -0.008       
## log_alpha  0.000  0.000
## convergence code: 0
## failure to converge in 10000 evaluations
# Growth-rate alpha on the natural scale.
exp(log_alpha)
## log_alpha 
## 0.1505684
cat("Average improvement over baseline: ", (yf-y0)/y0*100)
## Average improvement over baseline:  -0.1840261
# In-sample fit comparison across the four candidate models.
RMSE = sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) sqrt(mean(resid(mod)^2))) # RMSE
RMSE
## [1] 25.61632 25.60547 25.58810 23.86846
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) mean(abs(resid(mod)))) # MAE
## [1] 15.89991 15.89229 15.85936 14.69603
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) extractAIC(mod)) # edf & AIC: smoothing_spline$lme always has edf 5
##          [,1]     [,2]     [,3]     [,4]
## [1,]      4.0      5.0      5.0      6.0
## [2,] 122143.8 122134.6 122130.2 120909.6
# Effective degrees of freedom per model (n minus residual df); the gam
# component is used for the spline here, unlike the lme used for AIC above.
edf = sapply(list(REM_linear, REM_quadratic, smoothing_spline$gam, REM_bounded), function(mod) { nrow(data)-df.residual(mod) }) # while smoothing_spline$gam often has much higher edf
edf
## [1] 4.000000 5.000000 5.119836 6.000000
# Round for the plot legend (round() returns character strings here).
RMSE = round(RMSE,1)
edf = round(edf,1)

Plot

# Follow-up duration per participant (in predictor units) and, per 10-unit
# bin, the number of participants still contributing data.
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(predictor), .groups="keep") %>% deframe()
remaining_participants_bins = seq(0,350,by=10)
remaining_participants = data.frame(x=remaining_participants_bins, text=sapply(remaining_participants_bins, function(x) sum(participation_duration>=x)))

# Truncate the x-axis where fewer than 10 participants remain.
xmax = remaining_participants$x[which(remaining_participants$text<10)[1]]

# y-axis limits: central 90% of observed values.
range_90p =quantile(data$value, probs=c(0.05,0.95))

# Overlay of the four model fits on top of the individual trajectories, with
# an RMSE/edf legend rendered as rich text.
p1 = ggplot() +
  geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.1) +
  geom_line(aes(x,y), data.frame(x=0:xmax, y=predict(smoothing_spline$gam, newdata=data.frame(predictor=0:xmax))), linetype="longdash", size=1) + xlim(0,xmax) + ylim(range_90p[1], range_90p[2]) +
  stat_function(fun=equ_linear, color="blue", linetype="dotted", size=1) + 
  stat_function(fun=equ_quadratic, color="green4", linetype="dashed", size=1) + 
  stat_function(fun=equ_bounded, color=linecolor, size=1) + 
  theme_pubr(base_family="Serif") + no_x + xlab(NULL) + ylab(params$unit) +
  geom_richtext(aes(x,y,label=label,hjust=1), data.frame(x=0.8*xmax, y=range_90p[2]-(range_90p[2]-range_90p[1])/1.3, label=paste0("Model RMSE (edf):<br><span style='color:#0000ff'>····· Linear: ", RMSE[1], " (", edf[1], ") </span><br><span style='color:#008b00'>- - - Quadratic: ", RMSE[2], " (", edf[2], ") </span><br><span style='color:#000000'>— — Smoothing spline: ", RMSE[3], " (", edf[3], ") </span><br><span style='color:", linecolor, "'>— Bounded growth: ", RMSE[4], " (", edf[4], ") </span>")), family="Serif")

# Text strip of remaining-participant counts beneath the main panel.
p2 = ggplot(remaining_participants[remaining_participants$x<=xmax,]) +
  geom_text(aes(x=x,y="A",label=text), family="Serif") +
  theme_pubr(base_family="Serif") + no_y + xlab(params$xlab) + ylab(paste0("Remaining \n ", params$unit_n, "s")) +
  scale_x_continuous(breaks=seq(0,xmax,by=10))

(p1 / p2) + plot_layout(heights=c(0.9,0.1))
## Warning: Removed 901 row(s) containing missing values (geom_path).

# Profile-likelihood confidence intervals for the bounded-growth fixed effects;
# skipped in this section since params$bounded.growth.confidence.interval is
# FALSE. The original's two identical-condition if-statements are merged.
if (params$bounded.growth.confidence.interval) {
  conf = confint.merMod(REM_bounded, c("y0","yf","log_alpha"), method="profile")
  print(conf)
  print(exp(conf[3,]))  # back-transform the log_alpha CI to the alpha scale
  print(paste0("Average boundary improvement over baseline: ", round((yf-y0)/y0*100, 1), " (", round((conf[1,1]-conf[2,1])/conf[2,1]*100, 1), "-", round((conf[1,2]-conf[2,2])/conf[2,2]*100, 1), ")"))
}

Bounded growth model

# First derivative of the bounded-growth curve: instantaneous slope at time t.
equ_diff_REM_bounded = function(t) exp(log_alpha)*(yf-y0)*exp(exp(log_alpha)*-t)
# Inverse of the derivative: time at which the slope falls to target_slope.
equ_diff_get_time_REM_bounded = function(target_slope) log(exp(log_alpha)*(yf-y0)/target_slope)/exp(log_alpha)

# Inverse of equ_bounded: time at which the fitted curve reaches target_value.
equ_bounded_get_x = function(target_value) exp(-log_alpha)*log((yf-y0)/(yf-target_value))

# Annotate the curve at baseline, the half-practice point, and 90% of the
# total practice gain; a fourth row labels the asymptote yf.
growth_percentiles = c(0, 0.5, 0.9)
names_percentiles = c("baseline", "half-practice point", "90% practice")
selected_timepoints = equ_bounded_get_x(y0+(yf-y0)*growth_percentiles)
example_slopes_bounded = data.frame(
  x=selected_timepoints,
  y=equ_bounded(selected_timepoints),
  label=paste0("y=", round(equ_bounded(selected_timepoints),1), ", m=", signif(equ_diff_REM_bounded(selected_timepoints),2), " at ", params$unit_time, " ", round(selected_timepoints,0), ", ", names_percentiles),
  vjust=1.5
)
# The boundary label sits near the right edge of the plot (0.83*xmax).
example_slopes_bounded = rbind(example_slopes_bounded, list(x=0.83*xmax, y=yf, label=paste0("boundary: ", round(yf, 1)), vjust=-1.0))

# CI ribbon for the fitted curve from the profile CIs of the fixed effects
# (lower bounds paired together for ymin, upper bounds for ymax).
if (params$bounded.growth.confidence.interval) ribbon = data.frame(x=seq(0,xmax,0.05), ymin=equ_bounded(seq(0,xmax,0.05), conf["yf","2.5 %"], conf["y0","2.5 %"], conf["log_alpha","2.5 %"]), ymax=equ_bounded(seq(0,xmax,0.05), conf["yf","97.5 %"], conf["y0","97.5 %"], conf["log_alpha","97.5 %"]))

# Cohort summary: total tests, series count, and tests-per-series IQR.
quant = quantile(table(data$id))
print(paste0("n tests = ", nrow(data), " (n ", params$unit_n, "s = ", length(unique(data$id)), ", median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 12997 (n patients = 161, median tests per patient: 37, IQR 15-84)"
# Bounded-growth figure: grey individual trajectories, the fitted curve, a
# vertical line at the half-practice point, dots at the example timepoints
# (all rows except the boundary label), and text labels for each.
p1 = ggplot() + geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.2) +
  theme_pubr(base_family="Serif") + scale_x_continuous(limits = c(0,xmax), expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + xlab(params$xlab) + ylab(params$unit) +
  geom_vline(xintercept=example_slopes_bounded[2,"x"], color=linecolor, linetype=2) +
  stat_function(fun=equ_bounded, color=linecolor, size=1) +
  geom_point(data=example_slopes_bounded[1:(nrow(example_slopes_bounded)-1),], aes(x,y), color=linecolor, size=5) +
  geom_text(data=example_slopes_bounded, aes(x,y,label=label, vjust=vjust), color=linecolor, hjust=-0.01, family="Serif")

# Overlay the CI ribbon only when the profile CIs were computed above.
if (params$bounded.growth.confidence.interval) p1 = p1 + geom_ribbon(aes(x=x, ymin=ymin, ymax=ymax), ribbon, fill=linecolor, alpha=0.3)

p1
## Warning: Removed 901 row(s) containing missing values (geom_path).

Quantile regression

# Censor each series at the half-practice point (or a caller-supplied cutoff)
# so the quantile regressions describe the early part of the learning curve.
# Note: `round` here is the sprintf-based formatter defined earlier, so it
# returns a string; as.integer() parses it back to a number.
if (is.null(params$censor_after)) {
  censor_after = as.integer(round(selected_timepoints[2])) # half-practice point
} else {
  censor_after = params$censor_after
}
data_censored = data[data$predictor <= censor_after,]

percentiles = c(0.05,0.25,0.5,0.75,0.95)

# One quantile regression of value on predictor per tau, fitted jointly.
QR = rq(value ~ predictor, tau=percentiles, data_censored)

# Per-quantile coefficient tables with kernel-estimated standard errors.
# Computing summary(QR, se="ker") once up front fixes a quadratic cost in the
# original, which recomputed the full summary list inside every sapply
# iteration and ran a third summary(QR) only to obtain its length.
QR_summaries = summary(QR, se="ker")
p_vals = sapply(seq_along(QR_summaries), function(i) {
  summ = coef(QR_summaries[[i]])
  print(summ)
  # Wald 95% CIs from the kernel SEs; `round` is the sprintf-based formatter
  # defined earlier, so the printed values are pre-formatted strings.
  print(paste0("Intercept: ", round(summ[1,1],1), " (", round(summ[1,1]-1.96*summ[1,2],1), "-", round(summ[1,1]+1.96*summ[1,2],1), "), beta: ", round(summ[2,1],2), " (", round(summ[2,1]-1.96*summ[2,2],2), "-", round(summ[2,1]+1.96*summ[2,2],2), ")"))
  # Return the slope p-value Pr(>|t|) for the Bonferroni adjustment below.
  summ[2,4]
})
##                  Value Std. Error   t value  Pr(>|t|)
## (Intercept) 109.777468  4.2201757 26.012535 0.0000000
## predictor     1.234077  0.7734113  1.595628 0.1106345
## [1] "Intercept: 109.8 (101.5-118.0), beta: 1.23 (-0.28-2.75)"
##                  Value Std. Error   t value     Pr(>|t|)
## (Intercept) 185.758036  2.4733165 75.104838 0.000000e+00
## predictor     1.524471  0.3860884  3.948503 7.971279e-05
## [1] "Intercept: 185.8 (180.9-190.6), beta: 1.52 (0.77-2.28)"
##                   Value Std. Error    t value   Pr(>|t|)
## (Intercept) 223.0000000  1.1528150 193.439539 0.00000000
## predictor     0.3522387  0.1888365   1.865311 0.06219585
## [1] "Intercept: 223.0 (220.7-225.3), beta: 0.35 (-0.02-0.72)"
##                   Value Std. Error     t value  Pr(>|t|)
## (Intercept) 244.1444254  1.0236290 238.5087126 0.0000000
## predictor    -0.1129727  0.1729633  -0.6531601 0.5136828
## [1] "Intercept: 244.1 (242.1-246.2), beta: -0.11 (-0.45-0.23)"
##                   Value Std. Error  t value  Pr(>|t|)
## (Intercept) 266.6438817  1.3205451 201.9196 0.0000000
## predictor     0.5524386  0.2349802   2.3510 0.0187613
## [1] "Intercept: 266.6 (264.1-269.2), beta: 0.55 (0.09-1.01)"
# Bonferroni-correct the five slope p-values (one per quantile).
p_vals = p.adjust(p_vals, method="bonferroni")

# Joint test across the fitted quantile regressions (quantreg::anova.rq);
# its p-value is printed on the figure below.
ANOVA = anova(QR)

# Size of the censored dataset used for the quantile regression.
quant = quantile(table(data_censored$id))
print(paste0("n tests = ", nrow(data_censored), " (median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 5046 (median tests per patient: 25, IQR 12-51)"
# Format a p-value for display: significance-round to `digits` digits; a value
# that collapses to exactly zero is reported as "< 2e-16" (R's conventional
# smallest printed p), anything else as "= <rounded value>".
signif_p = function(x, digits=1) {
  rounded = signif(x, digits)
  if (as.character(rounded) == "0") "< 2e-16" else paste0("= ", rounded)
}

# Quantile-regression figure: censored trajectories in grey, one fitted line
# per percentile, each labelled at its intercept with the slope and the
# Bonferroni-adjusted p-value; overall ANOVA p annotated near the cutoff.
ggplot() + geom_line(aes_string("predictor", "value", group="id"), data_censored, alpha=0.2, color="darkgrey") + theme_pubr(base_family="Serif") + scale_x_continuous(expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + theme(legend.position = "none") + xlab(params$xlab) + ylab(params$unit) +
  geom_abline(intercept=coef(QR)[1,], slope=coef(QR)[2,], color=linecolor) +
  geom_text(data=data.frame(intercept=coef(QR)[1,], label=paste0(percentiles*100, "th percentile: β = ", round(coef(QR)[2,],1), ", ", "p.adj ", sapply(p_vals,signif_p))),
            mapping=aes(x=1,y=intercept, label=label), color=linecolor, hjust="left", vjust=1, family="Serif") +
  coord_cartesian(xlim=c(0,censor_after)) +
  geom_text(aes(x=x,y=y,label=label), data.frame(x=0.8*censor_after, y=0, label=paste0("ANOVA p ", signif_p(ANOVA$table$pvalue, 1))), vjust=-1.5, family="Serif") +
  geom_vline(xintercept=censor_after, color=linecolor, linetype=2)

Sensitivity analysis 3

Performance as a function of weeks since first test Minimum number of repetitions: 10, minimum number of weeks: 10

Cognition

# Parameters for this sensitivity run: SDMT performance modelled against weeks
# since first test, with minimums of 10 repetitions and 10 weeks, and the slow
# bounded-growth profile confidence intervals disabled.
params = list(
  test_code = "ips",
  test_metric_code = "correct_responses",
  unit = "SDMT: Correct Responses",
  unit_n = "patient",
  unit_time = "week",
  min_repetitions = 10,
  min_weeks = 10,
  predictor = "weeksSinceFirst",
  xlab = "Weeks",
  bounded.growth.confidence.interval = F,
  up_to_date = "2021-05-01"
)
library(data.table) # fread
library(parsedate) # parse_date
library(dplyr) # group_by
library(tibble) # deframe
library(lme4) # lmer
library(mgcv) # gamm
library(quantreg) # rq
library(patchwork) # plot_layout
library(gridExtra) # grid.arrange
library(ggpubr) # ggscatter
library(ggtext) # geom_richtext
library(sjPlot) # plot_model

# Download from: https://dataset.floodlightopen.com/public-blobs-prod/complete_dataset.csv
data = fread("complete_dataset.csv", data.table=F)

# Prepare dataset: keep only the selected test for non-control participants,
# restrict to tests started up to the cutoff date, and drop exact duplicates.
data = data[data$testCode == params$test_code & !data$participantIsControl,]
data$time = parse_date(data$testStartedAt)
data = data[data$time <= as.POSIXct(params$up_to_date, tz="UTC"),] # only analyse data up to (excluding) params$up_to_date
data = data[!duplicated(data),] # sometimes contains true duplicates for some reason (even with the same testResultMetricId)

# For "Finger Pinching" hand_used has to be determined
if (params$test_code == "pinching") {
  library(tidyr) # pivot_wider
  # just one means either "hand" or "successful_pinches" values are missing, remove those
  table(table(data$time))
  data = data[!data$time %in% names(which(table(data$time)==1)), ]
  # Widen to one row per test with one column per metric code.
  data = as.data.frame(pivot_wider(data, id_cols=c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time"), names_from="testMetricCode", values_from="testResultMetricValue"))
} else {
  # Single-metric tests: keep only the selected metric; hand_used does not apply.
  data = data[data$testMetricCode == params$test_metric_code,]
  data$hand_used = NA
  data = data[c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time", "testResultMetricValue", "hand_used")]
}

# Normalise column names; "value" is the analysed test result metric.
colnames(data) = c("id", "control", "sex", "birthyear", "time", "value", "hand_used")
data$age = year(data$time)-data$birthyear # Estimate age
data = data[order(as.character(data$id)),]

# 0 result values are discarded
data = data[!is.na(data$value) & data$value != 0,]

# Consider those supposedly younger than 18 (minimum study age) and older than 90 as NA
data$age[data$age < 18 | data$age > 90] = NA

# Each hand becomes its own series; for single-metric tests hand_used is NA,
# so the suffix is the constant "_handNA".
data$id_original = data$id
data$id = paste0(data$id, "_hand", data$hand_used)

data$day = as.IDate(data$time)
# NOTE: shadows base::round for the rest of the document -- returns a *string*
# formatted to `digits` decimals via sprintf, not a numeric value.
round = function(x, digits=0) sprintf(paste0("%.", digits, "f"), x)

# Theme fragments used to suppress one axis in composite figures.
no_x = theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
no_y = theme(axis.text.y=element_blank(), axis.ticks.y=element_blank())

# Accent colour for model fits and annotations throughout.
linecolor = "#c71138"

Participant selection

# At least x weeks & repetitions: number each series' tests (0-based
# "repetition") and compute weeks elapsed since that series' first test.
# Assumes `data` rows are ordered within id (sorted above).
for (id in unique(data$id)) {
  rows = which(data$id == id)
  first_time = data[rows, "time"][1]
  data[rows, "repetition"] = seq_along(rows) - 1
  data[rows, "weeksSinceFirst"] = as.numeric(difftime(data[rows, "time"], first_time, unit="weeks"))
}

# Pre-filter cohort sizes, kept for the inclusion-rate report further down.
n_orig = nrow(data)
n_patients_orig = length(unique(data$id_original))
n_hands_orig = length(unique(data$id))

# Per-series totals; last() is valid because the loop above wrote values in order.
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(weeksSinceFirst), repetitions=last(repetition), .groups="keep")

Among the total n=1095 patients with n=5715 repetitions, the median length of participation is 0.0 weeks (IQR 0.0-8.6, range 0.0-151.6) and the median number of repetitions is 1 (IQR 1-4, range 1-106).

# Inclusion criteria: keep series with at least min_weeks of participation and
# at least min_repetitions tests (repetition is 0-based, hence the +1).
data = data[data$id %in% participation_duration$id[participation_duration$weeks >= params$min_weeks & participation_duration$repetitions+1 >= params$min_repetitions],]

# Days between consecutive tests; the first test is paired with itself (0 days).
# NOTE(review): 1:(n-1) would misbehave for n == 1, but the filter above
# guarantees n >= min_repetitions per series here.
for (id in unique(data$id)) {
  subset = data$id == id
  n = sum(subset)
  data[subset, "daysSinceLast"] = as.numeric(difftime(data[subset, "time"], c(data[subset, "time"][1], data[subset, "time"][1:(n-1)]), unit="days"))
}

# Demographics and test-frequency summaries for the included series.
participation_duration = data %>% group_by(id) %>% summarise(sex=first(sex), mean_age=mean(age), weeks=last(weeksSinceFirst), repetitions=last(repetition), median_intertest_interval=median(daysSinceLast), IQR_intertest_interval=IQR(daysSinceLast), .groups="keep")

# The rest of the analysis runs against the generic "predictor" column
# (weeksSinceFirst in this run, per params$predictor).
data$predictor = data[,params$predictor]

Inclusion criteria: participation for at least 10 weeks and at least 10 repetitions performed per test, leading to the analysis of n=135 / 1095 patients and n=3627 / 5715 tests. Among those, the median length of participation is 38.6 weeks (IQR 15.5-64.2, range 10.0-151.6) and the median number of repetitions is 17 (IQR 13.5-33, range 10-106).

# Cohort characteristics table (transposed for one row per statistic).
# `round` is the sprintf-based formatter, so every cell is a ready-made string.
t(data.frame(
  n_patients = paste0(length(unique(data$id_original)), " / ", n_patients_orig, " (", round(length(unique(data$id_original))/n_patients_orig*100,1), "%)"),
  n_hands = paste0(length(unique(data$id)), " / ", n_hands_orig, " (", round(length(unique(data$id))/n_hands_orig*100,1), "%)"),
  n_tests = paste0(nrow(data), " / ", n_orig, " (", round(nrow(data)/n_orig*100,1), "%)"),
  percent_female = paste0(round(prop.table(table(participation_duration$sex == "female"))[[2]]*100, 1)),
  age = paste0(round(median(participation_duration$mean_age,na.rm=T),1), " (", round(quantile(participation_duration$mean_age, 0.25, na.rm=T),1), "-", round(quantile(participation_duration$mean_age, 0.75, na.rm=T),1), ", range ", round(min(participation_duration$mean_age, na.rm=T),1), "-", round(max(participation_duration$mean_age, na.rm=T),1), ")"),
  repetitions = paste0(median(participation_duration$repetitions)+1, " repetitions (IQR ", quantile(participation_duration$repetitions+1, 0.25), "-", quantile(participation_duration$repetitions+1, 0.75), ", range ", min(participation_duration$repetitions+1), "-", max(participation_duration$repetitions+1), ")"),
  median_intertest_interval = paste0(round(median(participation_duration$median_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$median_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$median_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$median_intertest_interval),1), "-", round(max(participation_duration$median_intertest_interval),1), ")"),
  IQR_intertest_interval = paste0(round(median(participation_duration$IQR_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$IQR_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$IQR_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$IQR_intertest_interval),1), "-", round(max(participation_duration$IQR_intertest_interval),1), ")"),
  weeks = paste0(round(median(participation_duration$weeks),1), " weeks (IQR ", round(quantile(participation_duration$weeks, 0.25),1), "-", round(quantile(participation_duration$weeks, 0.75),1), ", range ", round(min(participation_duration$weeks),1), "-", round(max(participation_duration$weeks),1), ")")
))
##                           [,1]                                          
## n_patients                "135 / 1095 (12.3%)"                          
## n_hands                   "135 / 1095 (12.3%)"                          
## n_tests                   "3627 / 5715 (63.5%)"                         
## percent_female            "68.1"                                        
## age                       "51.1 (43.3-57.9, range 22.9-74.3)"           
## repetitions               "17 repetitions (IQR 13.5-33, range 10-106)"  
## median_intertest_interval "7.1 days (IQR 7.0-8.8, range 6.7-38.8)"      
## IQR_intertest_interval    "1.7 days (IQR 0.4-5.8, range 0.0-46.0)"      
## weeks                     "38.6 weeks (IQR 15.5-64.2, range 10.0-151.6)"

Summary level analysis

Difference test

# One row per series: first/last/mean score, participation length, test count,
# and ages; diff (last - first) is the improvement analysed below.
df = as.data.frame(data %>% group_by(id) %>% summarise(first=first(value), last=last(value), mean=mean(value), weeksSinceFirst=max(weeksSinceFirst), repetition=n(), first_age=first(age), last_age=last(age), mean_age=mean(age), .groups="keep") %>% mutate(diff=last-first))

df$predictor = df[, params$predictor]

# Paired t-test of last vs first score per series.
test = t.test(df$last, df$first, paired=T)
test
## 
##  Paired t-test
## 
## data:  df$last and df$first
## t = 19.262, df = 134, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  10.31587 12.67672
## sample estimates:
## mean of the differences 
##                 11.4963
# Baseline model: improvement explained by age and first score only
# (both scaled by 10 so coefficients read per 10 years / per 10 points).
mod0 = lm(diff ~ I(mean_age/10) + I(first/10), df)
summ0 = summary(mod0)
summ0
## 
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.2315  -4.0548  -0.1762   3.9750  22.8922 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     24.3737     5.7055   4.272 3.70e-05 ***
## I(mean_age/10)  -0.2776     0.6852  -0.405    0.686    
## I(first/10)     -2.9466     0.7311  -4.030 9.39e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.469 on 131 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1492, Adjusted R-squared:  0.1362 
## F-statistic: 11.48 on 2 and 131 DF,  p-value: 2.541e-05
# Full model: adds log10(predictor) (participation length) to the baseline.
mod = lm(diff ~ I(mean_age/10) + I(first/10) + log10(predictor), df)
confint(mod)
##                       2.5 %     97.5 %
## (Intercept)       7.8164826 31.2316056
## I(mean_age/10)   -1.9166135  0.7805907
## I(first/10)      -4.4173566 -1.5802662
## log10(predictor)  0.9048721  7.6377443
summ = summary(mod)
summ
## 
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10) + log10(predictor), 
##     data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.6562  -4.4544   0.2936   4.1619  21.2867 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       19.5240     5.9178   3.299  0.00125 ** 
## I(mean_age/10)    -0.5680     0.6817  -0.833  0.40622    
## I(first/10)       -2.9988     0.7170  -4.182 5.27e-05 ***
## log10(predictor)   4.2713     1.7016   2.510  0.01330 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.342 on 130 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1885, Adjusted R-squared:  0.1698 
## F-statistic: 10.07 on 3 and 130 DF,  p-value: 5.207e-06
# additional variance explained by predictor
#print(summ$r.squared - summ0$r.squared)

# Relative improvement and its 95% CI, scaled by the mean first score.
print(paste0("Average observed improvement over baseline: ", round(test$estimate/mean(df$first)*100, 1), " (", round(test$conf[1]/mean(df$first)*100, 1), "-", round(test$conf[2]/mean(df$first)*100, 1), ")"))
## [1] "Average observed improvement over baseline: 29.6 (26.5-32.6)"
lab.y = 1.1*mean(df$last) # y position for the (currently commented-out) significance bracket

# Left: mean+-SE bars of first/mean/last scores. Right: coefficient plot of
# the full difference model.
p1 = ggbarplot(data.frame(Timepoint=rep(c("First","Mean","Last"),each=nrow(df)), value=c(df$first,df$mean,df$last)), "Timepoint", "value", add="mean_se", label=T, lab.nb.digits=1, lab.vjust=1.9, ylab=params$unit) + xlab("Score") #+ stat_compare_means(comparisons = list(c("First","Last")), paired=T, method="t.test", label.y=lab.y) + scale_y_continuous(expand=expansion(mult=c(0,0.1)))

p2 = plot_model(summ, show.values=T, vline.color = "grey", show.intercept=T, colors=linecolor, title=paste0("Difference from First to Last Score, R²=", round(summ$r.squared, 2)), axis.labels=rev(c("Intercept", "Age (per 10 years)", "First score (per 10)", paste0(params$xlab, " (log 10)"))), value.offset=0.3, show.p=F) + ylab("β estimates")

(p1 + p2) + plot_layout(widths=c(2,5)) & theme_pubr(base_family="Serif")

Confounders

# Pairwise scatter plots (with regression line, CI, and correlation
# coefficient) for every pair among age, first/last score, predictor, and
# diff; predictor axes are log10-scaled. Assembled into a panel further down.
p_age_first = ggscatter(df, "mean_age", "first", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("First score") + theme_pubr(base_family="Serif")

p_age_pred = ggscatter(df, "mean_age", "predictor", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("Mean age") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")

p_age_last = ggscatter(df, "mean_age", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("Last score") + theme_pubr(base_family="Serif")

p_age_diff = ggscatter(df, "mean_age", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("Difference first to last score") + theme_pubr(base_family="Serif")

p_first_pred = ggscatter(df, "first", "predictor",  add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("First score") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")

p_first_last = ggscatter(df, "first", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("First score") + ylab("Last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_first_diff = ggscatter(df, "first", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("First score") + ylab("Difference first to last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_pred_last = ggscatter(df, "predictor", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_x_log10(expand = expansion(mult = c(.05, .15))) + xlab(paste0(params$xlab, " (log10)")) + ylab("Last score") + theme_pubr(base_family="Serif")

p_pred_diff = ggscatter(df, "predictor", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_x_log10(expand = expansion(mult = c(.05, .15))) + xlab(paste0(params$xlab, " (log10)")) + ylab("Difference first to last score") + theme_pubr(base_family="Serif")

p_last_diff = ggscatter(df, "last", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Last score") + ylab("Difference first to last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_last_pred = ggscatter(df, "last", "predictor",  add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("Last score") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")


# Marginal histograms for the panel diagonal.
p_age = gghistogram(df, "mean_age", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_first = gghistogram(df, "first", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_last = gghistogram(df, "last", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_pred = gghistogram(df, "predictor", bins=15) + scale_x_log10() + xlab(NULL) + theme_pubr(base_family="Serif")
p_diff = gghistogram(df, "diff", bins=15) + xlab("Difference first to last") + theme_pubr(base_family="Serif")

#(((p1+xlab(NULL)) + (p2+xlab(NULL)+ylab(NULL))) / ((p3+xlab(NULL)) + (p4+xlab(NULL)+ylab(NULL))) / ((p5) | (p6+ylab(NULL)))) & theme_pubr(base_family="Serif")

#(p_age_first | p_first) / (p_age_last | p_first_last | p_last) / (p_age_pred | p_first_pred | p_last_pred | p_pred) / (p_age_diff | p_first_diff | p_last_diff | p_pred_diff)

# Lower-triangular 5x5 layout (histograms on the diagonal, scatter plots
# below), mimicking a pairs() panel.
m <- matrix(NA, 5, 5)
m[lower.tri(m, diag = T)] <- 1:15
grid.arrange(grobs=list(
  p_age, p_age_first+xlab(NULL), p_age_pred+xlab(NULL), p_age_last+xlab(NULL), p_age_diff,
  p_first, p_first_pred+xlab(NULL)+ylab(""), p_first_last+xlab(NULL)+ylab(""), p_first_diff+ylab(""),
  p_pred, p_pred_last+xlab(NULL)+ylab(""), p_pred_diff+ylab(""),
  p_last, p_last_diff+ylab(""), 
  p_diff
), layout_matrix=m, heights=c(1,1,1,1,1.1))
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

#GGally::ggpairs(df[c("mean_age", "first", "last", "predictor", "diff")])
#pairs(df[c("mean_age", "first", "last", "predictor", "diff")], upper.panel=NULL)
#corrplot::corrplot(cor(df[c("mean_age", "first", "last", "predictor", "diff")], use="complete.obs"))

Learning curve: Model selection

# Penalised-spline GAMM with a random intercept per series; the $lme component
# carries the mixed-model fit, the $gam component the smooth.
smoothing_spline = gamm(value ~ s(predictor, bs="ps"), random=list(id=~1), data=data)
summary(smoothing_spline$lme)
## Linear mixed-effects model fit by maximum likelihood
##  Data: strip.offset(mf) 
##        AIC      BIC    logLik
##   20328.17 20359.15 -10159.08
## 
## Random effects:
##  Formula: ~Xr - 1 | g
##  Structure: pdIdnot
##              Xr1      Xr2      Xr3      Xr4      Xr5      Xr6      Xr7      Xr8
## StdDev: 4.883366 4.883366 4.883366 4.883366 4.883366 4.883366 4.883366 4.883366
## 
##  Formula: ~1 | id %in% g
##         (Intercept) Residual
## StdDev:    9.164249 3.610145
## 
## Fixed effects: y ~ X - 1 
##                     Value Std.Error   DF  t-value p-value
## X(Intercept)     48.88064  0.792810 3491 61.65488       0
## Xs(predictor)Fx1 32.32925  7.101487 3491  4.55246       0
##  Correlation: 
##                  X(Int)
## Xs(predictor)Fx1 0.003 
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -7.74436405 -0.56631646  0.04682667  0.62582527  4.69089590 
## 
## Number of Observations: 3627
## Number of Groups: 
##         g id %in% g 
##         1       135
# Smooth-term summary: edf and approximate significance of s(predictor).
summary(smoothing_spline$gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## value ~ s(predictor, bs = "ps")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  48.8806     0.7927   61.66   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df     F p-value    
## s(predictor) 8.103  8.103 301.6  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0522   
##   Scale est. = 13.033    n = 3627
# Linear mixed model (random intercept per series) and its fixed-effect line.
REM_linear = lmer(value ~ (1|id) + predictor, data)
equ_linear = function(t) fixef(REM_linear)[1] + fixef(REM_linear)[2]*t
summary(REM_linear)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor
##    Data: data
## 
## REML criterion at convergence: 20899.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.5763 -0.5595  0.0775  0.6347  4.4160 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 84.13    9.172   
##  Residual             15.56    3.945   
## Number of obs: 3627, groups:  id, 135
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept) 45.067071   0.796124   56.61
## predictor    0.129663   0.003367   38.51
## 
## Correlation of Fixed Effects:
##           (Intr)
## predictor -0.083
# Quadratic mixed model; the scale warning is expected because predictor^2
# spans a much wider range than predictor.
REM_quadratic = lmer(value ~ (1|id) + predictor + I(predictor^2), data)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
equ_quadratic = function(t) fixef(REM_quadratic)[1] + fixef(REM_quadratic)[2]*t + fixef(REM_quadratic)[3]*t^2
summary(REM_quadratic)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor + I(predictor^2)
##    Data: data
## 
## REML criterion at convergence: 20760.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.8629 -0.5579  0.0582  0.6341  4.3695 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 83.76    9.152   
##  Residual             14.89    3.859   
## Number of obs: 3627, groups:  id, 135
## 
## Fixed effects:
##                  Estimate Std. Error t value
## (Intercept)     4.425e+01  7.967e-01   55.54
## predictor       2.170e-01  7.660e-03   28.32
## I(predictor^2) -9.939e-04  7.875e-05  -12.62
## 
## Correlation of Fixed Effects:
##             (Intr) prdctr
## predictor   -0.108       
## I(prdctr^2)  0.081 -0.903
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
# Bounded-growth (asymptotic regression) nonlinear mixed model via SSasymp:
# yf = asymptote, y0 = baseline, log_alpha = log rate constant; random y0 and
# yf per series.
REM_bounded = nlmer(value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0|id) + (yf|id), data = data, start=c(yf=40, y0=20, log_alpha=-1))
y0=fixef(REM_bounded)["y0"]
yf=fixef(REM_bounded)["yf"]
log_alpha=fixef(REM_bounded)["log_alpha"]
# Fitted population curve; parameters default to the fixed effects but can be
# overridden (e.g. with CI bounds when building the ribbon).
equ_bounded = function(t, yf=fixef(REM_bounded)[["yf"]], y0=fixef(REM_bounded)[["y0"]], log_alpha=fixef(REM_bounded)[["log_alpha"]]) yf+(y0-yf)*exp(-exp(log_alpha)*t)
summary(REM_bounded)
## Warning in vcov.merMod(object, use.hessian = use.hessian): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
## Warning in vcov.merMod(object, correlation = correlation, sigm = sig): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
## Nonlinear mixed model fit by maximum likelihood  ['nlmerMod']
## Formula: value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0 | id) + (yf |  
##     id)
##    Data: data
## 
##      AIC      BIC   logLik deviance df.resid 
##  20393.5  20430.6 -10190.7  20381.5     3621 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.8315 -0.5347  0.0532  0.6126  3.8736 
## 
## Random effects:
##  Groups   Name Variance Std.Dev.
##  id       y0    82.61    9.089  
##  id.1     yf   112.37   10.601  
##  Residual       12.22    3.496  
## Number of obs: 3627, groups:  id, 135
## 
## Fixed effects:
##           Estimate Std. Error t value
## yf        53.49272    0.97276   54.99
## y0        41.86880    0.80059   52.30
## log_alpha -2.80725    0.05128  -54.74
## 
## Correlation of Fixed Effects:
##           yf     y0    
## y0         0.003       
## log_alpha -0.267 -0.098
# Estimated learning rate alpha on the natural scale.
exp(log_alpha)
##  log_alpha 
## 0.06037096
cat("Average improvement over baseline: ", (yf-y0)/y0*100)
## Average improvement over baseline:  27.76272
# Model comparison: RMSE, MAE, and effective degrees of freedom for the four
# candidate learning-curve models.
RMSE = sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) sqrt(mean(resid(mod)^2))) # RMSE
RMSE
## [1] 3.870865 3.785758 3.538980 3.369193
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) mean(abs(resid(mod)))) # MAE
## [1] 2.935742 2.865599 2.658602 2.548282
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) extractAIC(mod)) # edf & AIC: smoothing_spline$lme always has edf 5
##         [,1]     [,2]     [,3]     [,4]
## [1,]     4.0     5.00     5.00     6.00
## [2,] 20899.1 20745.24 20328.17 20393.47
edf = sapply(list(REM_linear, REM_quadratic, smoothing_spline$gam, REM_bounded), function(mod) { nrow(data)-df.residual(mod) }) # while smoothing_spline$gam often has much higher edf
edf
## [1] 4.00000 5.00000 9.10292 6.00000
# `round` is the sprintf-based formatter, so RMSE/edf become display strings
# ready for the richtext legend below.
RMSE = round(RMSE,1)
edf = round(edf,1)

Plot

# Per-series participation length on the predictor scale, as a named vector
# (deframe turns the two-column summary into an id -> value lookup).
participation_duration = data %>%
  group_by(id) %>%
  summarise(weeks = last(predictor), .groups = "keep") %>%
  deframe()

# For every 10-unit bin, count participants still contributing data.
remaining_participants_bins = seq(0, 350, by = 10)
remaining_participants = data.frame(
  x = remaining_participants_bins,
  text = sapply(remaining_participants_bins, function(cutoff) sum(participation_duration >= cutoff))
)

# Truncate plots at the first bin with fewer than 10 remaining participants.
xmax = remaining_participants$x[which(remaining_participants$text < 10)[1]]

# 5th-95th percentile range of the raw scores, used as plot y-limits.
range_90p = quantile(data$value, probs = c(0.05, 0.95))

# Model-comparison figure: grey individual trajectories, the spline prediction
# as a long-dashed line, the three parametric fits via stat_function, and an
# RMSE/edf legend rendered with geom_richtext (HTML-colored to match lines).
p1 = ggplot() +
  geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.1) +
  geom_line(aes(x,y), data.frame(x=0:xmax, y=predict(smoothing_spline$gam, newdata=data.frame(predictor=0:xmax))), linetype="longdash", size=1) + xlim(0,xmax) + ylim(range_90p[1], range_90p[2]) +
  stat_function(fun=equ_linear, color="blue", linetype="dotted", size=1) + 
  stat_function(fun=equ_quadratic, color="green4", linetype="dashed", size=1) + 
  stat_function(fun=equ_bounded, color=linecolor, size=1) + 
  theme_pubr(base_family="Serif") + no_x + xlab(NULL) + ylab(params$unit) +
  geom_richtext(aes(x,y,label=label,hjust=1), data.frame(x=0.8*xmax, y=range_90p[2]-(range_90p[2]-range_90p[1])/1.3, label=paste0("Model RMSE (edf):<br><span style='color:#0000ff'>····· Linear: ", RMSE[1], " (", edf[1], ") </span><br><span style='color:#008b00'>- - - Quadratic: ", RMSE[2], " (", edf[2], ") </span><br><span style='color:#000000'>— — Smoothing spline: ", RMSE[3], " (", edf[3], ") </span><br><span style='color:", linecolor, "'>— Bounded growth: ", RMSE[4], " (", edf[4], ") </span>")), family="Serif")

# Thin strip under the learning-curve panel showing, for each x-axis bin,
# how many participants are still contributing data.
visible_bins = remaining_participants[remaining_participants$x <= xmax, ]
p2 = ggplot(visible_bins) +
  geom_text(aes(x = x, y = "A", label = text), family = "Serif") +
  theme_pubr(base_family = "Serif") +
  no_y +
  xlab(params$xlab) +
  ylab(paste0("Remaining \n ", params$unit_n, "s")) +
  scale_x_continuous(breaks = seq(0, xmax, by = 10))

# Stack the curve panel over the count strip (90% / 10% of the height).
(p1 / p2) + plot_layout(heights = c(0.9, 0.1))
## Warning: Removed 263 row(s) containing missing values (geom_path).

# Profile-likelihood 95% CIs for the bounded-growth fixed effects; skipped in
# this run (flag is FALSE in params). Both statements share the same guard,
# so they are folded into one conditional block.
if (params$bounded.growth.confidence.interval) {
  conf = confint.merMod(REM_bounded, c("y0","yf","log_alpha"), method="profile")
  print(conf)
  # log_alpha is fitted on the log scale; report the rate CI on the natural scale.
  print(exp(conf[3,]))
  # NOTE(review): pairs lower-with-lower and upper-with-upper bounds to bound
  # the relative improvement; assumes row order yf, y0 in conf -- verify.
  print(paste0("Average boundary improvement over baseline: ", round((yf-y0)/y0*100, 1), " (", round((conf[1,1]-conf[2,1])/conf[2,1]*100, 1), "-", round((conf[1,2]-conf[2,2])/conf[2,2]*100, 1), ")"))
}

Bounded growth model

# Slope of the bounded-growth curve at time t:
# d/dt [yf + (y0 - yf) * exp(-alpha * t)] = alpha * (yf - y0) * exp(-alpha * t),
# where alpha = exp(log_alpha); yf, y0, log_alpha are looked up from the
# calling environment (fitted fixed effects).
equ_diff_REM_bounded = function(t) {
  alpha = exp(log_alpha)
  alpha * (yf - y0) * exp(-alpha * t)
}
# Inverse of the slope function: time at which the curve's slope equals target_slope
equ_diff_get_time_REM_bounded = function(target_slope) {
  alpha = exp(log_alpha)
  log(alpha * (yf - y0) / target_slope) / alpha
}

# Inverse of the bounded-growth curve: time at which it reaches target_value
equ_bounded_get_x = function(target_value) {
  log((yf - y0) / (yf - target_value)) / exp(log_alpha)
}

# Timepoints at which the fitted curve reaches 0%, 50% and 90% of the total
# baseline-to-boundary improvement
growth_percentiles = c(0, 0.5, 0.9)
names_percentiles = c("baseline", "half-practice point", "90% practice")
selected_timepoints = equ_bounded_get_x(y0+(yf-y0)*growth_percentiles)
# Annotation data: fitted value y and slope m at each selected timepoint
example_slopes_bounded = data.frame(
  x=selected_timepoints,
  y=equ_bounded(selected_timepoints),
  label=paste0("y=", round(equ_bounded(selected_timepoints),1), ", m=", signif(equ_diff_REM_bounded(selected_timepoints),2), " at ", params$unit_time, " ", round(selected_timepoints,0), ", ", names_percentiles),
  vjust=1.5
)
# Extra label for the asymptote ("boundary") of the curve
example_slopes_bounded = rbind(example_slopes_bounded, list(x=0.83*xmax, y=yf, label=paste0("boundary: ", round(yf, 1)), vjust=-1.0))

# Pointwise band built from the profile CIs of the three fixed effects
if (params$bounded.growth.confidence.interval) ribbon = data.frame(x=seq(0,xmax,0.05), ymin=equ_bounded(seq(0,xmax,0.05), conf["yf","2.5 %"], conf["y0","2.5 %"], conf["log_alpha","2.5 %"]), ymax=equ_bounded(seq(0,xmax,0.05), conf["yf","97.5 %"], conf["y0","97.5 %"], conf["log_alpha","97.5 %"]))

# Tests per participant: median and IQR for reporting
quant = quantile(table(data$id))
print(paste0("n tests = ", nrow(data), " (n ", params$unit_n, "s = ", length(unique(data$id)), ", median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 3627 (n patients = 135, median tests per patient: 17, IQR 13.5-33)"
# Bounded-growth curve over the raw trajectories, annotated with the example
# slopes and the boundary; the vertical line marks the half-practice point
p1 = ggplot() + geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.2) +
  theme_pubr(base_family="Serif") + scale_x_continuous(limits = c(0,xmax), expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + xlab(params$xlab) + ylab(params$unit) +
  geom_vline(xintercept=example_slopes_bounded[2,"x"], color=linecolor, linetype=2) +
  stat_function(fun=equ_bounded, color=linecolor, size=1) +
  geom_point(data=example_slopes_bounded[1:(nrow(example_slopes_bounded)-1),], aes(x,y), color=linecolor, size=5) +
  geom_text(data=example_slopes_bounded, aes(x,y,label=label, vjust=vjust), color=linecolor, hjust=-0.01, family="Serif")

# Add the confidence ribbon only if the profile CIs were computed above
if (params$bounded.growth.confidence.interval) p1 = p1 + geom_ribbon(aes(x=x, ymin=ymin, ymax=ymax), ribbon, fill=linecolor, alpha=0.3)

p1
## Warning: Removed 110 row(s) containing missing values (geom_path).

Quantile regression

# Censor follow-up at the half-practice point unless an explicit cutoff is
# configured in params$censor_after. Note: round() here is the script's custom
# sprintf-based formatter returning character; as.integer() parses it back.
if (is.null(params$censor_after)) {
  censor_after = as.integer(round(selected_timepoints[2])) # half-practice point
} else {
  censor_after = params$censor_after
}
data_censored = data[data$predictor <= censor_after,]

percentiles = c(0.05,0.25,0.5,0.75,0.95)

# Quantile regression of the score on the predictor at each percentile
QR = rq(value ~ predictor, tau=percentiles, data_censored)

# Kernel-based standard errors, computed ONCE (previously summary(QR, se="ker")
# was recomputed inside every loop iteration); vapply guarantees a numeric
# result even for degenerate input, unlike sapply.
QR_summaries = summary(QR, se="ker")
p_vals = vapply(seq_along(QR_summaries), function(i) {
  summ = coef(QR_summaries[[i]])
  print(summ)
  print(paste0("Intercept: ", round(summ[1,1],1), " (", round(summ[1,1]-1.96*summ[1,2],1), "-", round(summ[1,1]+1.96*summ[1,2],1), "), beta: ", round(summ[2,1],2), " (", round(summ[2,1]-1.96*summ[2,2],2), "-", round(summ[2,1]+1.96*summ[2,2],2), ")"))
  summ[2,4] # p-value of the slope term
}, numeric(1))
##                  Value Std. Error   t value     Pr(>|t|)
## (Intercept) 27.0000000  0.7899768 34.178218 0.000000e+00
## predictor    0.8000419  0.1231026  6.498982 1.164673e-10
## [1] "Intercept: 27.0 (25.5-28.5), beta: 0.80 (0.56-1.04)"
##                  Value Std. Error   t value     Pr(>|t|)
## (Intercept) 36.1210791  0.6056317 59.641988 0.000000e+00
## predictor    0.6787753  0.1032713  6.572742 7.231238e-11
## [1] "Intercept: 36.1 (34.9-37.3), beta: 0.68 (0.48-0.88)"
##                  Value Std. Error   t value     Pr(>|t|)
## (Intercept) 41.6096034  0.6102834 68.180783 0.000000e+00
## predictor    0.7984169  0.1055003  7.567908 7.327472e-14
## [1] "Intercept: 41.6 (40.4-42.8), beta: 0.80 (0.59-1.01)"
##                  Value Std. Error   t value    Pr(>|t|)
## (Intercept) 48.0000000  0.6892900 69.636874 0.00000e+00
## predictor    0.7748156  0.1177457  6.580413 6.87963e-11
## [1] "Intercept: 48.0 (46.6-49.4), beta: 0.77 (0.54-1.01)"
##                  Value Std. Error   t value     Pr(>|t|)
## (Intercept) 56.0000000  0.9022525 62.066884 0.000000e+00
## predictor    0.8549147  0.1679645  5.089854 4.131299e-07
## [1] "Intercept: 56.0 (54.2-57.8), beta: 0.85 (0.53-1.18)"
# Bonferroni-adjust the slope p-values across the five quantiles
p_vals = p.adjust(p_vals, method="bonferroni")

# Joint test across quantiles
ANOVA = anova(QR)

# Tests per participant within the censored window
quant = quantile(table(data_censored$id))
print(paste0("n tests = ", nrow(data_censored), " (median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 1254 (median tests per patient: 10, IQR 8-11)"
# Format a p-value for display: round to `digits` significant figures and
# prefix with "= "; values that round to exactly 0 (numerical underflow) are
# reported as "< 2e-16", matching R's own summary output convention.
signif_p = function(x, digits=1) {
  rounded = signif(x, digits)
  if (as.character(rounded) == "0") {
    return("< 2e-16")
  }
  paste0("= ", rounded)
}

# Quantile-regression fan plot: raw censored trajectories plus the five fitted
# percentile lines, each labelled with its slope and adjusted p-value
ggplot() + geom_line(aes_string("predictor", "value", group="id"), data_censored, alpha=0.2, color="darkgrey") + theme_pubr(base_family="Serif") + scale_x_continuous(expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + theme(legend.position = "none") + xlab(params$xlab) + ylab(params$unit) +
  geom_abline(intercept=coef(QR)[1,], slope=coef(QR)[2,], color=linecolor) +
  geom_text(data=data.frame(intercept=coef(QR)[1,], label=paste0(percentiles*100, "th percentile: β = ", round(coef(QR)[2,],1), ", ", "p.adj ", sapply(p_vals,signif_p))),
            mapping=aes(x=1,y=intercept, label=label), color=linecolor, hjust="left", vjust=1, family="Serif") +
  coord_cartesian(xlim=c(0,censor_after)) +
  geom_text(aes(x=x,y=y,label=label), data.frame(x=0.8*censor_after, y=0, label=paste0("ANOVA p ", signif_p(ANOVA$table$pvalue, 1))), vjust=-1.5, family="Serif") +
  geom_vline(xintercept=censor_after, color=linecolor, linetype=2)

Dexterity

# Analysis parameters for the Dexterity (Finger Pinching) section; here the
# predictor is weeks since first test (vs. repetition number for SDMT above)
params = list(
  test_code = "pinching",
  test_metric_code = "successful_pinches",
  unit = "Pinching: Successful Pinches",
  unit_n = "hand",
  unit_time = "week",
  min_repetitions = 10,
  min_weeks = 10,
  predictor = "weeksSinceFirst",
  xlab = "Weeks",
  bounded.growth.confidence.interval = F,
  up_to_date = "2021-05-01"
)
library(data.table) # fread
library(parsedate) # parse_date
library(dplyr) # group_by
library(tibble) # deframe
library(lme4) # lmer
library(mgcv) # gamm
library(quantreg) # rq
library(patchwork) # plot_layout
library(gridExtra) # grid.arrange
library(ggpubr) # ggscatter
library(ggtext) # geom_text
library(sjPlot) # plot_model

# Download from: https://dataset.floodlightopen.com/public-blobs-prod/complete_dataset.csv
data = fread("complete_dataset.csv", data.table=F)

# Prepare dataset: keep only the selected test, controls excluded
data = data[data$testCode == params$test_code & !data$participantIsControl,]
data$time = parse_date(data$testStartedAt)
data = data[data$time <= as.POSIXct(params$up_to_date, tz="UTC"),] # only analyse data up to (excluding) params$up_to_date
data = data[!duplicated(data),] # sometimes contains true duplicates for some reason (even with the same testResultMetricId)

# For "Finger Pinching" hand_used has to be determined
if (params$test_code == "pinching") {
  library(tidyr) # pivot_wider
  # just one means either "hand" or "successful_pinches" values are missing, remove those
  table(table(data$time))
  data = data[!data$time %in% names(which(table(data$time)==1)), ]
  # widen so that each test is one row with the metric values as columns
  data = as.data.frame(pivot_wider(data, id_cols=c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time"), names_from="testMetricCode", values_from="testResultMetricValue"))
} else {
  # other tests carry the metric directly; no hand information available
  data = data[data$testMetricCode == params$test_metric_code,]
  data$hand_used = NA
  data = data[c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time", "testResultMetricValue", "hand_used")]
}

colnames(data) = c("id", "control", "sex", "birthyear", "time", "value", "hand_used")
data$age = year(data$time)-data$birthyear # Estimate age
data = data[order(as.character(data$id)),] # order() is stable, so within-id row order is unchanged

# 0 result values are discarded
data = data[!is.na(data$value) & data$value != 0,]

# Consider those supposedly younger than 18 (minimum study age) and older than 90 as NA
data$age[data$age < 18 | data$age > 90] = NA

# Analyse each hand separately: the analysis unit ("id") becomes patient x hand
data$id_original = data$id
data$id = paste0(data$id, "_hand", data$hand_used)

data$day = as.IDate(data$time)
# Deliberately shadows base::round: formats x as a fixed-decimal STRING
# (e.g. "3.10"), so downstream paste0() calls always show the requested
# number of digits. Note the character return type — numeric callers must
# use base::round explicitly.
round = function(x, digits=0) {
  fmt = paste0("%.", digits, "f")
  sprintf(fmt, x)
}

# ggplot theme helpers that blank out one axis (used in stacked layouts)
no_x = theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
no_y = theme(axis.text.y=element_blank(), axis.ticks.y=element_blank())

# Accent colour used for all model overlays and annotations
linecolor = "#c71138"

Participant selection

# At least x weeks & repetitions
# Number each test per id (0-based "repetition") and compute weeks elapsed
# since that id's first test. Assumes rows within an id are chronologically
# ordered (the earlier sort by id is stable) — TODO confirm source ordering.
for (id in unique(data$id)) {
  subset = data$id == id
  n = sum(subset)
  data[subset, "repetition"] = seq_len(n) - 1 # seq_len() is safe even if n were 0, unlike 1:n
  data[subset, "weeksSinceFirst"] = as.numeric(difftime(data[subset, "time"], data[subset, "time"][1], unit="weeks"))
}

# Totals before applying inclusion criteria (used in the "x / y" summaries below)
n_orig = nrow(data)
n_patients_orig = length(unique(data$id_original))
n_hands_orig = length(unique(data$id))

# Per-id participation length; last() relies on within-id ordering
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(weeksSinceFirst), repetitions=last(repetition), .groups="keep")

Among the total n=1816 patients with n=20695 repetitions, the median length of participation is 0.6 weeks (IQR 0.0-8.1, range 0.0-151.7) and the median number of repetitions is 2 (IQR 1-7, range 1-370).

# Keep only ids meeting the inclusion criteria (repetition is 0-based, hence +1)
data = data[data$id %in% participation_duration$id[participation_duration$weeks >= params$min_weeks & participation_duration$repetitions+1 >= params$min_repetitions],]

# Days elapsed since the previous test of the same id; the first test maps to
# itself and therefore gets 0.
for (id in unique(data$id)) {
  subset = data$id == id
  times = data[subset, "time"]
  # Lag the timestamps by one, repeating the first element. head(times, -1)
  # is safe for a single-row id, where the original times[1:(n-1)] would have
  # evaluated 1:(n-1) as 1:0 == c(1, 0) and mis-indexed the lag vector.
  previous = c(times[1], head(times, -1))
  data[subset, "daysSinceLast"] = as.numeric(difftime(times, previous, unit="days"))
}

# Per-id summary of the included sample: demographics plus testing cadence
participation_duration = data %>% group_by(id) %>% summarise(sex=first(sex), mean_age=mean(age), weeks=last(weeksSinceFirst), repetitions=last(repetition), median_intertest_interval=median(daysSinceLast), IQR_intertest_interval=IQR(daysSinceLast), .groups="keep")

# Generic predictor column ("repetition" or "weeksSinceFirst", per params)
data$predictor = data[,params$predictor]

Inclusion criteria: participation for at least 10 weeks and at least 10 repetitions performed per hand, leading to the analysis of n=150 / 1059 patients, 289 / 1816 hands and n=16107 / 20695 tests. Among those, the median length of participation is 28.2 weeks (IQR 15.3-57.9, range 10.0-151.7) and the median number of repetitions is 35 (IQR 18-59, range 10-370).

# Cohort description table: counts, demographics and testing cadence,
# formatted as strings (the custom round() returns character)
t(data.frame(
  n_patients = paste0(length(unique(data$id_original)), " / ", n_patients_orig, " (", round(length(unique(data$id_original))/n_patients_orig*100,1), "%)"),
  n_hands = paste0(length(unique(data$id)), " / ", n_hands_orig, " (", round(length(unique(data$id))/n_hands_orig*100,1), "%)"),
  n_tests = paste0(nrow(data), " / ", n_orig, " (", round(nrow(data)/n_orig*100,1), "%)"),
  percent_female = paste0(round(prop.table(table(participation_duration$sex == "female"))[[2]]*100, 1)),
  age = paste0(round(median(participation_duration$mean_age,na.rm=T),1), " (", round(quantile(participation_duration$mean_age, 0.25, na.rm=T),1), "-", round(quantile(participation_duration$mean_age, 0.75, na.rm=T),1), ", range ", round(min(participation_duration$mean_age, na.rm=T),1), "-", round(max(participation_duration$mean_age, na.rm=T),1), ")"),
  repetitions = paste0(median(participation_duration$repetitions)+1, " repetitions (IQR ", quantile(participation_duration$repetitions+1, 0.25), "-", quantile(participation_duration$repetitions+1, 0.75), ", range ", min(participation_duration$repetitions+1), "-", max(participation_duration$repetitions+1), ")"),
  median_intertest_interval = paste0(round(median(participation_duration$median_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$median_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$median_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$median_intertest_interval),1), "-", round(max(participation_duration$median_intertest_interval),1), ")"),
  IQR_intertest_interval = paste0(round(median(participation_duration$IQR_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$IQR_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$IQR_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$IQR_intertest_interval),1), "-", round(max(participation_duration$IQR_intertest_interval),1), ")"),
  weeks = paste0(round(median(participation_duration$weeks),1), " weeks (IQR ", round(quantile(participation_duration$weeks, 0.25),1), "-", round(quantile(participation_duration$weeks, 0.75),1), ", range ", round(min(participation_duration$weeks),1), "-", round(max(participation_duration$weeks),1), ")")
))
##                           [,1]                                          
## n_patients                "150 / 1059 (14.2%)"                          
## n_hands                   "289 / 1816 (15.9%)"                          
## n_tests                   "16107 / 20695 (77.8%)"                       
## percent_female            "70.2"                                        
## age                       "51.1 (43.4-58.1, range 20.0-74.4)"           
## repetitions               "35 repetitions (IQR 18-59, range 10-370)"    
## median_intertest_interval "2.9 days (IQR 2.1-4.2, range 1.9-37.0)"      
## IQR_intertest_interval    "2.2 days (IQR 0.7-5.6, range 0.0-61.1)"      
## weeks                     "28.2 weeks (IQR 15.3-57.9, range 10.0-151.7)"

Summary level analysis

Difference test

# Per-id summary: first/last/mean score, follow-up length, test count, age,
# plus the change score (last - first)
df = as.data.frame(data %>% group_by(id) %>% summarise(first=first(value), last=last(value), mean=mean(value), weeksSinceFirst=max(weeksSinceFirst), repetition=n(), first_age=first(age), last_age=last(age), mean_age=mean(age), .groups="keep") %>% mutate(diff=last-first))

df$predictor = df[, params$predictor]

# Paired t-test of last vs first score within id
test = t.test(df$last, df$first, paired=T)
test
## 
##  Paired t-test
## 
## data:  df$last and df$first
## t = 20.174, df = 288, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  16.32819 19.85866
## sample estimates:
## mean of the differences 
##                18.09343
# Baseline model: change score explained by age and first score only
# (coefficients are per 10 years / per 10 score points)
mod0 = lm(diff ~ I(mean_age/10) + I(first/10), df)
summ0 = summary(mod0)
summ0
## 
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10), data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.974  -9.201   0.863  10.347  28.446 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     43.1418     4.8569   8.883  < 2e-16 ***
## I(mean_age/10)  -2.1949     0.8066  -2.721  0.00691 ** 
## I(first/10)     -5.4520     0.6530  -8.349 3.07e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.73 on 284 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1978, Adjusted R-squared:  0.1922 
## F-statistic: 35.02 on 2 and 284 DF,  p-value: 2.542e-14
# Full model: adds (log10) amount of practice as predictor of the change score
mod = lm(diff ~ I(mean_age/10) + I(first/10) + log10(predictor), df)
confint(mod)
##                       2.5 %    97.5 %
## (Intercept)      26.0361124 47.706577
## I(mean_age/10)   -4.2353462 -1.005830
## I(first/10)      -6.8760902 -4.314358
## log10(predictor)  0.9787927 10.921859
summ = summary(mod)
summ
## 
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10) + log10(predictor), 
##     data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.478  -8.914   1.315   9.968  28.278 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       36.8713     5.5046   6.698 1.14e-10 ***
## I(mean_age/10)    -2.6206     0.8203  -3.194  0.00156 ** 
## I(first/10)       -5.5952     0.6507  -8.599 5.60e-16 ***
## log10(predictor)   5.9503     2.5257   2.356  0.01916 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.62 on 283 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.2133, Adjusted R-squared:  0.2049 
## F-statistic: 25.57 on 3 and 283 DF,  p-value: 1.144e-14
# additional variance explained by predictor
#print(summ$r.squared - summ0$r.squared)

# Observed relative improvement: mean paired difference over mean first score
print(paste0("Average observed improvement over baseline: ", round(test$estimate/mean(df$first)*100, 1), " (", round(test$conf[1]/mean(df$first)*100, 1), "-", round(test$conf[2]/mean(df$first)*100, 1), ")"))
## [1] "Average observed improvement over baseline: 70.5 (63.6-77.4)"
lab.y = 1.1*mean(df$last)

# Bar plot of first/mean/last scores (mean +/- SE)
p1 = ggbarplot(data.frame(Timepoint=rep(c("First","Mean","Last"),each=nrow(df)), value=c(df$first,df$mean,df$last)), "Timepoint", "value", add="mean_se", label=T, lab.nb.digits=1, lab.vjust=1.9, ylab=params$unit) + xlab("Score") #+ stat_compare_means(comparisons = list(c("First","Last")), paired=T, method="t.test", label.y=lab.y) + scale_y_continuous(expand=expansion(mult=c(0,0.1)))

# Coefficient (forest) plot of the full model
p2 = plot_model(summ, show.values=T, vline.color = "grey", show.intercept=T, colors=linecolor, title=paste0("Difference from First to Last Score, R²=", round(summ$r.squared, 2)), axis.labels=rev(c("Intercept", "Age (per 10 years)", "First score (per 10)", paste0(params$xlab, " (log 10)"))), value.offset=0.3, show.p=F) + ylab("β estimates")

(p1 + p2) + plot_layout(widths=c(2,5)) & theme_pubr(base_family="Serif")

Confounders

# Pairwise association plots (correlation coefficient with CI and regression
# line) among mean age, first score, last score, amount of practice
# (log10 predictor) and the change score
p_age_first = ggscatter(df, "mean_age", "first", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("First score") + theme_pubr(base_family="Serif")

p_age_pred = ggscatter(df, "mean_age", "predictor", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("Mean age") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")

p_age_last = ggscatter(df, "mean_age", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("Last score") + theme_pubr(base_family="Serif")

p_age_diff = ggscatter(df, "mean_age", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("Difference first to last score") + theme_pubr(base_family="Serif")

p_first_pred = ggscatter(df, "first", "predictor",  add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("First score") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")

p_first_last = ggscatter(df, "first", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("First score") + ylab("Last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_first_diff = ggscatter(df, "first", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("First score") + ylab("Difference first to last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_pred_last = ggscatter(df, "predictor", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_x_log10(expand = expansion(mult = c(.05, .15))) + xlab(paste0(params$xlab, " (log10)")) + ylab("Last score") + theme_pubr(base_family="Serif")

p_pred_diff = ggscatter(df, "predictor", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_x_log10(expand = expansion(mult = c(.05, .15))) + xlab(paste0(params$xlab, " (log10)")) + ylab("Difference first to last score") + theme_pubr(base_family="Serif")

p_last_diff = ggscatter(df, "last", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Last score") + ylab("Difference first to last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_last_pred = ggscatter(df, "last", "predictor",  add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("Last score") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")


# Marginal distributions for the diagonal of the grid below
p_age = gghistogram(df, "mean_age", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_first = gghistogram(df, "first", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_last = gghistogram(df, "last", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_pred = gghistogram(df, "predictor", bins=15) + scale_x_log10() + xlab(NULL) + theme_pubr(base_family="Serif")
p_diff = gghistogram(df, "diff", bins=15) + xlab("Difference first to last") + theme_pubr(base_family="Serif")

#(((p1+xlab(NULL)) + (p2+xlab(NULL)+ylab(NULL))) / ((p3+xlab(NULL)) + (p4+xlab(NULL)+ylab(NULL))) / ((p5) | (p6+ylab(NULL)))) & theme_pubr(base_family="Serif")

#(p_age_first | p_first) / (p_age_last | p_first_last | p_last) / (p_age_pred | p_first_pred | p_last_pred | p_pred) / (p_age_diff | p_first_diff | p_last_diff | p_pred_diff)

# Lower-triangular 5x5 layout: histograms on the diagonal, pairwise scatters
# below; cells are numbered 1-15 column-wise to match the grobs list order
m <- matrix(NA, 5, 5)
m[lower.tri(m, diag = T)] <- 1:15
grid.arrange(grobs=list(
  p_age, p_age_first+xlab(NULL), p_age_pred+xlab(NULL), p_age_last+xlab(NULL), p_age_diff,
  p_first, p_first_pred+xlab(NULL)+ylab(""), p_first_last+xlab(NULL)+ylab(""), p_first_diff+ylab(""),
  p_pred, p_pred_last+xlab(NULL)+ylab(""), p_pred_diff+ylab(""),
  p_last, p_last_diff+ylab(""), 
  p_diff
), layout_matrix=m, heights=c(1,1,1,1,1.1))
## Warning: Removed 2 rows containing non-finite values (stat_bin).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing non-finite values (stat_cor).
## Warning: Removed 2 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing non-finite values (stat_cor).
## Warning: Removed 2 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing non-finite values (stat_cor).
## Warning: Removed 2 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing non-finite values (stat_cor).
## Warning: Removed 2 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

#GGally::ggpairs(df[c("mean_age", "first", "last", "predictor", "diff")])
#pairs(df[c("mean_age", "first", "last", "predictor", "diff")], upper.panel=NULL)
#corrplot::corrplot(cor(df[c("mean_age", "first", "last", "predictor", "diff")], use="complete.obs"))

Learning curve: Model selection

# Penalized-spline GAMM of score vs predictor with a random intercept per id
smoothing_spline = gamm(value ~ s(predictor, bs="ps"), random=list(id=~1), data=data)
summary(smoothing_spline$lme)
## Linear mixed-effects model fit by maximum likelihood
##  Data: strip.offset(mf) 
##        AIC      BIC    logLik
##   112961.9 113000.3 -56475.95
## 
## Random effects:
##  Formula: ~Xr - 1 | g
##  Structure: pdIdnot
##              Xr1      Xr2      Xr3      Xr4      Xr5      Xr6      Xr7      Xr8
## StdDev: 9.391351 9.391351 9.391351 9.391351 9.391351 9.391351 9.391351 9.391351
## 
##  Formula: ~1 | id %in% g
##         (Intercept) Residual
## StdDev:    12.91174 7.721774
## 
## Fixed effects: y ~ X - 1 
##                     Value Std.Error    DF  t-value p-value
## X(Intercept)     42.13984  0.765822 15817 55.02564  0.0000
## Xs(predictor)Fx1 32.00912  9.779939 15817  3.27294  0.0011
##  Correlation: 
##                  X(Int)
## Xs(predictor)Fx1 0.003 
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -6.11150323 -0.48681468  0.07998043  0.59534358  6.75775923 
## 
## Number of Observations: 16107
## Number of Groups: 
##         g id %in% g 
##         1       289
# GAM view of the same fit: smooth-term edf and significance
summary(smoothing_spline$gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## value ~ s(predictor, bs = "ps")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  42.1398     0.7658   55.03   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##               edf Ref.df     F p-value    
## s(predictor) 8.54   8.54 546.3  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.12   
##   Scale est. = 59.626    n = 16107
# Linear mixed model (random intercept per id) and its fitted mean curve
REM_linear = lmer(value ~ (1|id) + predictor, data)
equ_linear = function(t) fixef(REM_linear)[1] + fixef(REM_linear)[2]*t
summary(REM_linear)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor
##    Data: data
## 
## REML criterion at convergence: 114437.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.1817 -0.4964  0.0935  0.6197  5.7342 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 171.73   13.104  
##  Residual              65.66    8.103  
## Number of obs: 16107, groups:  id, 289
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept) 36.442439   0.778619   46.80
## predictor    0.189734   0.003587   52.89
## 
## Correlation of Fixed Effects:
##           (Intr)
## predictor -0.076
# Quadratic mixed model; the scale warning below is expected since
# predictor^2 spans a much larger range than predictor
REM_quadratic = lmer(value ~ (1|id) + predictor + I(predictor^2), data)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
equ_quadratic = function(t) fixef(REM_quadratic)[1] + fixef(REM_quadratic)[2]*t + fixef(REM_quadratic)[3]*t^2
summary(REM_quadratic)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor + I(predictor^2)
##    Data: data
## 
## REML criterion at convergence: 114135
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.1051 -0.4867  0.0857  0.6098  6.0032 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 169.70   13.027  
##  Residual              64.37    8.023  
## Number of obs: 16107, groups:  id, 289
## 
## Fixed effects:
##                  Estimate Std. Error t value
## (Intercept)    35.4586841  0.7758746   45.70
## predictor       0.3136396  0.0077570   40.43
## I(predictor^2) -0.0014301  0.0000796  -17.97
## 
## Correlation of Fixed Effects:
##             (Intr) prdctr
## predictor   -0.097       
## I(prdctr^2)  0.071 -0.889
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
# Nonlinear mixed model: bounded growth via the self-starting asymptotic
# regression SSasymp, with per-id random effects on baseline y0 and asymptote yf
REM_bounded = nlmer(value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0|id) + (yf|id), data = data, start=c(yf=40, y0=20, log_alpha=-1))
y0=fixef(REM_bounded)["y0"]
yf=fixef(REM_bounded)["yf"]
log_alpha=fixef(REM_bounded)["log_alpha"]
# Fitted curve; defaults use the fixed effects, but the parameters can be
# overridden (used for the confidence ribbon elsewhere)
equ_bounded = function(t, yf=fixef(REM_bounded)[["yf"]], y0=fixef(REM_bounded)[["y0"]], log_alpha=fixef(REM_bounded)[["log_alpha"]]) yf+(y0-yf)*exp(-exp(log_alpha)*t)
summary(REM_bounded)
## Nonlinear mixed model fit by maximum likelihood  ['nlmerMod']
## Formula: value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0 | id) + (yf |  
##     id)
##    Data: data
## 
##      AIC      BIC   logLik deviance df.resid 
## 110640.4 110686.6 -55314.2 110628.4    16101 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.8636 -0.4894  0.0816  0.6114  5.2337 
## 
## Random effects:
##  Groups   Name Variance Std.Dev.
##  id       y0   175.22   13.237  
##  id.1     yf   294.19   17.152  
##  Residual       49.32    7.023  
## Number of obs: 16107, groups:  id, 289
## 
## Fixed effects:
##           Estimate Std. Error t value
## yf         49.0349     1.0970   44.70
## y0         32.0651     0.7988   40.14
## log_alpha  -2.7603     0.0457  -60.41
## 
## Correlation of Fixed Effects:
##           yf     y0    
## y0         0.013       
## log_alpha -0.319 -0.116
# Learning rate on the natural scale (alpha = exp(log_alpha))
exp(log_alpha)
##  log_alpha 
## 0.06326945
cat("Average improvement over baseline: ", (yf-y0)/y0*100)
## Average improvement over baseline:  52.92288
# Compare the four models on in-sample fit and effective degrees of freedom
RMSE = sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) sqrt(mean(resid(mod)^2))) # RMSE
RMSE
## [1] 8.030923 7.951052 7.651202 6.901903
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) mean(abs(resid(mod)))) # MAE
## [1] 5.950272 5.865388 5.627759 5.094504
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) extractAIC(mod)) # edf & AIC: smoothing_spline$lme always has edf 5
##          [,1]     [,2]     [,3]     [,4]
## [1,]      4.0      5.0      5.0      6.0
## [2,] 114437.5 114119.8 112961.9 110640.4
edf = sapply(list(REM_linear, REM_quadratic, smoothing_spline$gam, REM_bounded), function(mod) { nrow(data)-df.residual(mod) }) # while smoothing_spline$gam often has much higher edf
edf
## [1] 4.000000 5.000000 9.540483 6.000000
# Formatted (as character, via the custom round) for the plot annotation
RMSE = round(RMSE,1)
edf = round(edf,1)

Plot

# Participation length per hand (in predictor units) and the number of hands
# still contributing at each 10-unit bin
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(predictor), .groups="keep") %>% deframe()
remaining_participants_bins = seq(0,350,by=10)
remaining_participants = data.frame(x=remaining_participants_bins, text=sapply(remaining_participants_bins, function(x) sum(participation_duration>=x)))

# Cut the x-axis where fewer than 10 hands still contribute data
xmax = remaining_participants$x[which(remaining_participants$text<10)[1]]

# Central 90% of observed values; used as y-axis limits below
range_90p =quantile(data$value, probs=c(0.05,0.95))

# Model-comparison plot: raw per-id trajectories (grey) overlaid with the four
# fitted learning curves, annotated with each model's RMSE and edf
p1 = ggplot() +
  geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.1) +
  geom_line(aes(x,y), data.frame(x=0:xmax, y=predict(smoothing_spline$gam, newdata=data.frame(predictor=0:xmax))), linetype="longdash", size=1) + xlim(0,xmax) + ylim(range_90p[1], range_90p[2]) +
  stat_function(fun=equ_linear, color="blue", linetype="dotted", size=1) + 
  stat_function(fun=equ_quadratic, color="green4", linetype="dashed", size=1) + 
  stat_function(fun=equ_bounded, color=linecolor, size=1) + 
  theme_pubr(base_family="Serif") + no_x + xlab(NULL) + ylab(params$unit) +
  geom_richtext(aes(x,y,label=label,hjust=1), data.frame(x=0.8*xmax, y=range_90p[2]-(range_90p[2]-range_90p[1])/1.3, label=paste0("Model RMSE (edf):<br><span style='color:#0000ff'>····· Linear: ", RMSE[1], " (", edf[1], ") </span><br><span style='color:#008b00'>- - - Quadratic: ", RMSE[2], " (", edf[2], ") </span><br><span style='color:#000000'>— — Smoothing spline: ", RMSE[3], " (", edf[3], ") </span><br><span style='color:", linecolor, "'>— Bounded growth: ", RMSE[4], " (", edf[4], ") </span>")), family="Serif")

# Strip below the main plot: number of hands remaining at each x bin
p2 = ggplot(remaining_participants[remaining_participants$x<=xmax,]) +
  geom_text(aes(x=x,y="A",label=text), family="Serif") +
  theme_pubr(base_family="Serif") + no_y + xlab(params$xlab) + ylab(paste0("Remaining \n ", params$unit_n, "s")) +
  scale_x_continuous(breaks=seq(0,xmax,by=10))

(p1 / p2) + plot_layout(heights=c(0.9,0.1))
## Warning: Removed 843 row(s) containing missing values (geom_path).

# Profile-likelihood CIs for the bounded-growth fixed effects; skipped in this
# section (params$bounded.growth.confidence.interval is F here)
if (params$bounded.growth.confidence.interval) conf = confint.merMod(REM_bounded, c("y0","yf","log_alpha"), method="profile")
if (params$bounded.growth.confidence.interval) {
  print(conf)
  print(exp(conf[3,])) # alpha CI on the natural scale
  print(paste0("Average boundary improvement over baseline: ", round((yf-y0)/y0*100, 1), " (", round((conf[1,1]-conf[2,1])/conf[2,1]*100, 1), "-", round((conf[1,2]-conf[2,2])/conf[2,2]*100, 1), ")"))
}

Bounded growth model

# Slope of the bounded-growth curve at time t:
# d/dt [yf + (y0 - yf) * exp(-alpha * t)] = alpha * (yf - y0) * exp(-alpha * t),
# where alpha = exp(log_alpha); yf, y0, log_alpha are looked up from the
# calling environment (fitted fixed effects).
equ_diff_REM_bounded = function(t) {
  alpha = exp(log_alpha)
  alpha * (yf - y0) * exp(-alpha * t)
}
# Inverse of the slope function: time at which the curve's slope equals target_slope
equ_diff_get_time_REM_bounded = function(target_slope) {
  alpha = exp(log_alpha)
  log(alpha * (yf - y0) / target_slope) / alpha
}

# Inverse of the bounded-growth curve: time at which it reaches target_value
equ_bounded_get_x = function(target_value) {
  log((yf - y0) / (yf - target_value)) / exp(log_alpha)
}

# Example points on the fitted curve: baseline (0%), half-practice point (50%)
# and 90%-practice point of the total bounded improvement
growth_percentiles = c(0, 0.5, 0.9)
names_percentiles = c("baseline", "half-practice point", "90% practice")
selected_timepoints = equ_bounded_get_x(y0+(yf-y0)*growth_percentiles)
example_slopes_bounded = data.frame(
  x=selected_timepoints,
  y=equ_bounded(selected_timepoints),
  label=paste0("y=", round(equ_bounded(selected_timepoints),1), ", m=", signif(equ_diff_REM_bounded(selected_timepoints),2), " at ", params$unit_time, " ", round(selected_timepoints,0), ", ", names_percentiles),
  vjust=1.5
)
# Extra annotation row marking the asymptote ("boundary") of the curve
example_slopes_bounded = rbind(example_slopes_bounded, list(x=0.83*xmax, y=yf, label=paste0("boundary: ", round(yf, 1)), vjust=-1.0))

# 95% CI band for the curve, built from the profile CIs of the three fixed effects
if (params$bounded.growth.confidence.interval) ribbon = data.frame(x=seq(0,xmax,0.05), ymin=equ_bounded(seq(0,xmax,0.05), conf["yf","2.5 %"], conf["y0","2.5 %"], conf["log_alpha","2.5 %"]), ymax=equ_bounded(seq(0,xmax,0.05), conf["yf","97.5 %"], conf["y0","97.5 %"], conf["log_alpha","97.5 %"]))

# Cohort size and tests-per-unit summary for the modelling dataset
quant = quantile(table(data$id))
print(paste0("n tests = ", nrow(data), " (n ", params$unit_n, "s = ", length(unique(data$id)), ", median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 16107 (n hands = 289, median tests per hand: 35, IQR 18-59)"
# Bounded-growth curve over the raw per-id trajectories, annotated with the
# example slope points; the dashed vertical line marks the half-practice point
p1 = ggplot() + geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.2) +
  theme_pubr(base_family="Serif") + scale_x_continuous(limits = c(0,xmax), expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + xlab(params$xlab) + ylab(params$unit) +
  geom_vline(xintercept=example_slopes_bounded[2,"x"], color=linecolor, linetype=2) +
  stat_function(fun=equ_bounded, color=linecolor, size=1) +
  geom_point(data=example_slopes_bounded[1:(nrow(example_slopes_bounded)-1),], aes(x,y), color=linecolor, size=5) +
  geom_text(data=example_slopes_bounded, aes(x,y,label=label, vjust=vjust), color=linecolor, hjust=-0.01, family="Serif")

# Overlay the CI ribbon only when profile CIs were computed
if (params$bounded.growth.confidence.interval) p1 = p1 + geom_ribbon(aes(x=x, ymin=ymin, ymax=ymax), ribbon, fill=linecolor, alpha=0.3)

p1
## Warning: Removed 423 row(s) containing missing values (geom_path).

Quantile regression

# Censor follow-up at the half-practice point (or at an explicit override from
# params) so the quantile regression is fit on the early, near-linear phase only
if (is.null(params$censor_after)) {
  censor_after = as.integer(round(selected_timepoints[2])) # half-practice point
} else {
  censor_after = params$censor_after
}
data_censored = data[data$predictor <= censor_after,]

# Quantile regression of value on predictor at five performance percentiles
percentiles = c(0.05,0.25,0.5,0.75,0.95)

QR = rq(value ~ predictor, tau=percentiles, data_censored)

# Print per-quantile coefficients with Wald 95% CIs (kernel SEs) and collect
# the slope p-values for multiplicity adjustment below
p_vals = sapply(1:length(summary(QR)), function(i) {
  summ = coef(summary(QR, se="ker")[[i]])
  print(summ)
  print(paste0("Intercept: ", round(summ[1,1],1), " (", round(summ[1,1]-1.96*summ[1,2],1), "-", round(summ[1,1]+1.96*summ[1,2],1), "), beta: ", round(summ[2,1],2), " (", round(summ[2,1]-1.96*summ[2,2],2), "-", round(summ[2,1]+1.96*summ[2,2],2), ")"))
  summ[2,4] # slope p-value
})
##                Value Std. Error   t value Pr(>|t|)
## (Intercept) 6.465098  0.6238065 10.363946        0
## predictor   1.020881  0.1046404  9.756092        0
## [1] "Intercept: 6.5 (5.2-7.7), beta: 1.02 (0.82-1.23)"
##                 Value Std. Error  t value Pr(>|t|)
## (Intercept) 21.663820 0.45703834 47.40044        0
## predictor    1.389721 0.08702302 15.96958        0
## [1] "Intercept: 21.7 (20.8-22.6), beta: 1.39 (1.22-1.56)"
##                 Value Std. Error  t value Pr(>|t|)
## (Intercept) 32.119116 0.47953897 66.97916        0
## predictor    1.480253 0.07493568 19.75365        0
## [1] "Intercept: 32.1 (31.2-33.1), beta: 1.48 (1.33-1.63)"
##                 Value Std. Error  t value Pr(>|t|)
## (Intercept) 43.918104 0.43964242 99.89506        0
## predictor    1.049899 0.06975284 15.05170        0
## [1] "Intercept: 43.9 (43.1-44.8), beta: 1.05 (0.91-1.19)"
##                  Value Std. Error    t value Pr(>|t|)
## (Intercept) 55.0603317  0.5143569 107.046949        0
## predictor    0.8246257  0.0847887   9.725656        0
## [1] "Intercept: 55.1 (54.1-56.1), beta: 0.82 (0.66-0.99)"
# Bonferroni-adjust the five slope p-values
p_vals = p.adjust(p_vals, method="bonferroni")

# Joint test of slope equality across the quantiles
ANOVA = anova(QR)

quant = quantile(table(data_censored$id))
print(paste0("n tests = ", nrow(data_censored), " (median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 6243 (median tests per hand: 22, IQR 11-32)"
# Format a p-value for display: "= <p rounded to `digits` significant figures>",
# or "< 2e-16" when it underflows to zero (mirroring R's own summary printing).
signif_p = function(x, digits=1) {
  p = signif(x, digits)
  if (as.character(p) == "0") {
    "< 2e-16"
  } else {
    paste0("= ", p)
  }
}

# Censored trajectories with the fitted quantile-regression lines; each line is
# annotated with its slope and Bonferroni-adjusted p-value, plus the joint ANOVA p
ggplot() + geom_line(aes_string("predictor", "value", group="id"), data_censored, alpha=0.2, color="darkgrey") + theme_pubr(base_family="Serif") + scale_x_continuous(expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + theme(legend.position = "none") + xlab(params$xlab) + ylab(params$unit) +
  geom_abline(intercept=coef(QR)[1,], slope=coef(QR)[2,], color=linecolor) +
  geom_text(data=data.frame(intercept=coef(QR)[1,], label=paste0(percentiles*100, "th percentile: β = ", round(coef(QR)[2,],1), ", ", "p.adj ", sapply(p_vals,signif_p))),
            mapping=aes(x=1,y=intercept, label=label), color=linecolor, hjust="left", vjust=1, family="Serif") +
  coord_cartesian(xlim=c(0,censor_after)) +
  geom_text(aes(x=x,y=y,label=label), data.frame(x=0.8*censor_after, y=0, label=paste0("ANOVA p ", signif_p(ANOVA$table$pvalue, 1))), vjust=-1.5, family="Serif") +
  geom_vline(xintercept=censor_after, color=linecolor, linetype=2)

Mobility

# Analysis parameters for the mobility test (Two Minute Walk), analysed over
# calendar weeks since first test rather than repetition number
params = list(
  test_code = "two_min_walk",
  test_metric_code = "steps",
  unit = "Two Minute Walk: Steps",
  unit_n = "patient",
  unit_time = "week",
  min_repetitions = 10,
  min_weeks = 10,
  predictor = "weeksSinceFirst",
  xlab = "Weeks",
  bounded.growth.confidence.interval = F,
  censor_after = 11, # allow comparison with cognition
  up_to_date = "2021-05-01"
)
library(data.table) # fread
library(parsedate) # parse_date
library(dplyr) # group_by
library(tibble) # deframe
library(lme4) # lmer
library(mgcv) # gamm
library(quantreg) # rq
library(patchwork) # plot_layout
library(gridExtra) # grid.arrange
library(ggpubr) # ggscatter
library(ggtext) # geom_text
library(sjPlot) # plot_model

# Download from: https://dataset.floodlightopen.com/public-blobs-prod/complete_dataset.csv
data = fread("complete_dataset.csv", data.table=F)

# Prepare dataset: keep only the selected test, patients only (no controls)
data = data[data$testCode == params$test_code & !data$participantIsControl,]
data$time = parse_date(data$testStartedAt)
data = data[data$time <= as.POSIXct(params$up_to_date, tz="UTC"),] # only analyse data up to (excluding) params$up_to_date
data = data[!duplicated(data),] # sometimes contains true duplicates for some reason (even with the same testResultMetricId)

# For "Finger Pinching" hand_used has to be determined by widening the paired
# metric rows per timestamp; all other tests keep one metric row per test and
# carry no hand information (hand_used stays NA)
if (params$test_code == "pinching") {
  library(tidyr) # pivot_wider
  # just one means either "hand" or "successful_pinches" values are missing, remove those
  table(table(data$time))
  data = data[!data$time %in% names(which(table(data$time)==1)), ]
  data = as.data.frame(pivot_wider(data, id_cols=c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time"), names_from="testMetricCode", values_from="testResultMetricValue"))
} else {
  data = data[data$testMetricCode == params$test_metric_code,]
  data$hand_used = NA
  data = data[c("floodlightOpenId", "participantIsControl", "participantSex", "participantBirthYear", "time", "testResultMetricValue", "hand_used")]
}

colnames(data) = c("id", "control", "sex", "birthyear", "time", "value", "hand_used")
data$age = year(data$time)-data$birthyear # Estimate age (birth year only, so up to +/- 1 year off)
data = data[order(as.character(data$id)),]

# 0 result values are discarded
data = data[!is.na(data$value) & data$value != 0,]

# Consider those supposedly younger than 18 (minimum study age) and older than 90 as NA
data$age[data$age < 18 | data$age > 90] = NA

# Analysis unit is participant x hand; hand_used is NA for non-pinching tests,
# so the composite id then reduces to one unit per participant
data$id_original = data$id
data$id = paste0(data$id, "_hand", data$hand_used)

data$day = as.IDate(data$time)
round = function(x, digits=0) sprintf(paste0("%.", digits, "f"), x)

# ggplot helpers: themes that blank one axis, and the accent colour used
# throughout the figures
no_x = theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
no_y = theme(axis.text.y=element_blank(), axis.ticks.y=element_blank())

linecolor = "#c71138"

Participant selection

# At least x weeks & repetitions
# Per id: 0-based repetition counter and weeks elapsed since that id's first test
for (id in unique(data$id)) {
  subset = data$id == id
  n = sum(subset)
  data[subset, "repetition"] = (1:n)-1
  data[subset, "weeksSinceFirst"] = as.numeric(difftime(data[subset, "time"], data[subset, "time"][1], unit="weeks"))
}

# Totals before applying the inclusion criteria (for reporting fractions later)
n_orig = nrow(data)
n_patients_orig = length(unique(data$id_original))
n_hands_orig = length(unique(data$id))

# Per-id participation length in weeks and number of repetitions
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(weeksSinceFirst), repetitions=last(repetition), .groups="keep")

Among the total n=540 patients with n=14031 repetitions, the median length of participation is 0.9 weeks (IQR 0.0-11.2, range 0.0-133.5) and the median number of repetitions is 3 (IQR 1-14.25, range 1-735).

# Inclusion criteria: at least params$min_weeks of follow-up and at least
# params$min_repetitions tests (repetition is 0-based, hence the +1)
data = data[data$id %in% participation_duration$id[participation_duration$weeks >= params$min_weeks & participation_duration$repetitions+1 >= params$min_repetitions],]

# Days between consecutive tests per id (the first test gets 0 by construction);
# the filter above guarantees n >= params$min_repetitions, so 1:(n-1) is safe
for (id in unique(data$id)) {
  subset = data$id == id
  n = sum(subset)
  data[subset, "daysSinceLast"] = as.numeric(difftime(data[subset, "time"], c(data[subset, "time"][1], data[subset, "time"][1:(n-1)]), unit="days"))
}

# Per-id descriptives for the included cohort
participation_duration = data %>% group_by(id) %>% summarise(sex=first(sex), mean_age=mean(age), weeks=last(weeksSinceFirst), repetitions=last(repetition), median_intertest_interval=median(daysSinceLast), IQR_intertest_interval=IQR(daysSinceLast), .groups="keep")

# Predictor column selected in params ("repetition" or "weeksSinceFirst")
data$predictor = data[,params$predictor]

Inclusion criteria: participation for at least 10 weeks and at least 10 repetitions performed per test, leading to the analysis of n=118 / 540 patients and n=12253 / 14031 tests. Among those, the median length of participation is 27.9 weeks (IQR 15.3-55.8, range 10.3-133.5) and the median number of repetitions is 61.5 (IQR 23.5-108.75, range 10-735).

# One-column descriptives table for the included cohort (printed as text,
# each entry formatted as "median (IQR, range)" or "included / total (%)")
t(data.frame(
  n_patients = paste0(length(unique(data$id_original)), " / ", n_patients_orig, " (", round(length(unique(data$id_original))/n_patients_orig*100,1), "%)"),
  n_hands = paste0(length(unique(data$id)), " / ", n_hands_orig, " (", round(length(unique(data$id))/n_hands_orig*100,1), "%)"),
  n_tests = paste0(nrow(data), " / ", n_orig, " (", round(nrow(data)/n_orig*100,1), "%)"),
  percent_female = paste0(round(prop.table(table(participation_duration$sex == "female"))[[2]]*100, 1)),
  age = paste0(round(median(participation_duration$mean_age,na.rm=T),1), " (", round(quantile(participation_duration$mean_age, 0.25, na.rm=T),1), "-", round(quantile(participation_duration$mean_age, 0.75, na.rm=T),1), ", range ", round(min(participation_duration$mean_age, na.rm=T),1), "-", round(max(participation_duration$mean_age, na.rm=T),1), ")"),
  repetitions = paste0(median(participation_duration$repetitions)+1, " repetitions (IQR ", quantile(participation_duration$repetitions+1, 0.25), "-", quantile(participation_duration$repetitions+1, 0.75), ", range ", min(participation_duration$repetitions+1), "-", max(participation_duration$repetitions+1), ")"),
  median_intertest_interval = paste0(round(median(participation_duration$median_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$median_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$median_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$median_intertest_interval),1), "-", round(max(participation_duration$median_intertest_interval),1), ")"),
  IQR_intertest_interval = paste0(round(median(participation_duration$IQR_intertest_interval),1), " days (IQR ", round(quantile(participation_duration$IQR_intertest_interval, 0.25),1), "-", round(quantile(participation_duration$IQR_intertest_interval, 0.75),1), ", range ", round(min(participation_duration$IQR_intertest_interval),1), "-", round(max(participation_duration$IQR_intertest_interval),1), ")"),
  weeks = paste0(round(median(participation_duration$weeks),1), " weeks (IQR ", round(quantile(participation_duration$weeks, 0.25),1), "-", round(quantile(participation_duration$weeks, 0.75),1), ", range ", round(min(participation_duration$weeks),1), "-", round(max(participation_duration$weeks),1), ")")
))
##                           [,1]                                              
## n_patients                "118 / 540 (21.9%)"                               
## n_hands                   "118 / 540 (21.9%)"                               
## n_tests                   "12253 / 14031 (87.3%)"                           
## percent_female            "69.5"                                            
## age                       "50.2 (43.0-58.0, range 25.0-74.3)"               
## repetitions               "61.5 repetitions (IQR 23.5-108.75, range 10-735)"
## median_intertest_interval "1.1 days (IQR 1.0-2.1, range 1.0-24.9)"          
## IQR_intertest_interval    "1.6 days (IQR 0.4-4.0, range 0.1-32.0)"          
## weeks                     "27.9 weeks (IQR 15.3-55.8, range 10.3-133.5)"

Summary level analysis

Difference test

# One row per id: first/mean/last score, follow-up length, number of tests,
# ages, and the first-to-last score difference
df = as.data.frame(data %>% group_by(id) %>% summarise(first=first(value), last=last(value), mean=mean(value), weeksSinceFirst=max(weeksSinceFirst), repetition=n(), first_age=first(age), last_age=last(age), mean_age=mean(age), .groups="keep") %>% mutate(diff=last-first))

df$predictor = df[, params$predictor]

# Paired t-test: last vs first score per id
test = t.test(df$last, df$first, paired=T)
test
## 
##  Paired t-test
## 
## data:  df$last and df$first
## t = 0.75247, df = 117, p-value = 0.4533
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -5.407502 12.034621
## sample estimates:
## mean of the differences 
##                3.313559
# Baseline model: score change explained by age and first score only
# (predictors scaled per 10 units to get readable coefficients)
mod0 = lm(diff ~ I(mean_age/10) + I(first/10), df)
summ0 = summary(mod0)
summ0
## 
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -138.104  -18.099    6.796   25.089  124.618 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     64.0343    31.4890   2.034 0.044320 *  
## I(mean_age/10)   2.7008     4.2556   0.635 0.526936    
## I(first/10)     -3.5897     0.8972  -4.001 0.000112 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.97 on 114 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1388, Adjusted R-squared:  0.1236 
## F-statistic: 9.183 on 2 and 114 DF,  p-value: 0.0002006
# Full model: additionally adjust for the (log10) amount of practice/time
mod = lm(diff ~ I(mean_age/10) + I(first/10) + log10(predictor), df)
confint(mod)
##                       2.5 %     97.5 %
## (Intercept)      -16.285896 116.556238
## I(mean_age/10)    -7.742960   9.921057
## I(first/10)       -5.461403  -1.900159
## log10(predictor) -10.811657  43.439895
summ = summary(mod)
summ
## 
## Call:
## lm(formula = diff ~ I(mean_age/10) + I(first/10) + log10(predictor), 
##     data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -138.264  -18.055    6.996   23.432  119.744 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       50.1352    33.5260   1.495    0.138    
## I(mean_age/10)     1.0890     4.4580   0.244    0.807    
## I(first/10)       -3.6808     0.8988  -4.095 7.95e-05 ***
## log10(predictor)  16.3141    13.6917   1.192    0.236    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.89 on 113 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1494, Adjusted R-squared:  0.1269 
## F-statistic: 6.618 on 3 and 113 DF,  p-value: 0.000369
# additional variance explained by predictor
#print(summ$r.squared - summ0$r.squared)

# Observed mean improvement relative to the mean first score, with 95% CI
print(paste0("Average observed improvement over baseline: ", round(test$estimate/mean(df$first)*100, 1), " (", round(test$conf[1]/mean(df$first)*100, 1), "-", round(test$conf[2]/mean(df$first)*100, 1), ")"))
## [1] "Average observed improvement over baseline: 1.6 (-2.6-5.8)"
lab.y = 1.1*mean(df$last)

# Bar plot of first/mean/last scores (mean +/- SE)
p1 = ggbarplot(data.frame(Timepoint=rep(c("First","Mean","Last"),each=nrow(df)), value=c(df$first,df$mean,df$last)), "Timepoint", "value", add="mean_se", label=T, lab.nb.digits=1, lab.vjust=1.9, ylab=params$unit) + xlab("Score") #+ stat_compare_means(comparisons = list(c("First","Last")), paired=T, method="t.test", label.y=lab.y) + scale_y_continuous(expand=expansion(mult=c(0,0.1)))

# Coefficient plot for the full difference model
# NOTE(review): plot_model() is documented for fitted model objects; the
# summary object `summ` is passed here — confirm it renders as intended (or pass `mod`)
p2 = plot_model(summ, show.values=T, vline.color = "grey", show.intercept=T, colors=linecolor, title=paste0("Difference from First to Last Score, R²=", round(summ$r.squared, 2)), axis.labels=rev(c("Intercept", "Age (per 10 years)", "First score (per 10)", paste0(params$xlab, " (log 10)"))), value.offset=0.3, show.p=F) + ylab("β estimates")

(p1 + p2) + plot_layout(widths=c(2,5)) & theme_pubr(base_family="Serif")

Confounders

# Pairwise scatter plots (with regression line and correlation coefficient)
# among mean age, first score, predictor, last score and first-to-last
# difference; the predictor axis is log10-scaled throughout
p_age_first = ggscatter(df, "mean_age", "first", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("First score") + theme_pubr(base_family="Serif")

p_age_pred = ggscatter(df, "mean_age", "predictor", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("Mean age") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")

p_age_last = ggscatter(df, "mean_age", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("Last score") + theme_pubr(base_family="Serif")

p_age_diff = ggscatter(df, "mean_age", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Mean age") + ylab("Difference first to last score") + theme_pubr(base_family="Serif")

p_first_pred = ggscatter(df, "first", "predictor",  add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("First score") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")

p_first_last = ggscatter(df, "first", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("First score") + ylab("Last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_first_diff = ggscatter(df, "first", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("First score") + ylab("Difference first to last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_pred_last = ggscatter(df, "predictor", "last", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_x_log10(expand = expansion(mult = c(.05, .15))) + xlab(paste0(params$xlab, " (log10)")) + ylab("Last score") + theme_pubr(base_family="Serif")

p_pred_diff = ggscatter(df, "predictor", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_x_log10(expand = expansion(mult = c(.05, .15))) + xlab(paste0(params$xlab, " (log10)")) + ylab("Difference first to last score") + theme_pubr(base_family="Serif")

p_last_diff = ggscatter(df, "last", "diff", add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + xlab("Last score") + ylab("Difference first to last score") + theme_pubr(base_family="Serif") #+ geom_abline(intercept=0,slope=1)

p_last_pred = ggscatter(df, "last", "predictor",  add="reg.line", alpha=0.2, cor.coef=T, cor.coeff.args=list(color=linecolor), conf.int=T, add.params=list(color=linecolor)) + scale_y_log10() + xlab("Last score") + ylab(paste0(params$xlab, " (log10)")) + theme_pubr(base_family="Serif")


# Marginal histograms for the diagonal of the panel matrix below
p_age = gghistogram(df, "mean_age", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_first = gghistogram(df, "first", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_last = gghistogram(df, "last", bins=15) + xlab(NULL) + theme_pubr(base_family="Serif")
p_pred = gghistogram(df, "predictor", bins=15) + scale_x_log10() + xlab(NULL) + theme_pubr(base_family="Serif")
p_diff = gghistogram(df, "diff", bins=15) + xlab("Difference first to last") + theme_pubr(base_family="Serif")

#(((p1+xlab(NULL)) + (p2+xlab(NULL)+ylab(NULL))) / ((p3+xlab(NULL)) + (p4+xlab(NULL)+ylab(NULL))) / ((p5) | (p6+ylab(NULL)))) & theme_pubr(base_family="Serif")

#(p_age_first | p_first) / (p_age_last | p_first_last | p_last) / (p_age_pred | p_first_pred | p_last_pred | p_pred) / (p_age_diff | p_first_diff | p_last_diff | p_pred_diff)

# Lower-triangular layout matrix: histograms on the diagonal, the 10 pairwise
# scatter plots below it (15 panels in column-major order)
m <- matrix(NA, 5, 5)
m[lower.tri(m, diag = T)] <- 1:15
grid.arrange(grobs=list(
  p_age, p_age_first+xlab(NULL), p_age_pred+xlab(NULL), p_age_last+xlab(NULL), p_age_diff,
  p_first, p_first_pred+xlab(NULL)+ylab(""), p_first_last+xlab(NULL)+ylab(""), p_first_diff+ylab(""),
  p_pred, p_pred_last+xlab(NULL)+ylab(""), p_pred_diff+ylab(""),
  p_last, p_last_diff+ylab(""), 
  p_diff
), layout_matrix=m, heights=c(1,1,1,1,1.1))
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_cor).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

#GGally::ggpairs(df[c("mean_age", "first", "last", "predictor", "diff")])
#pairs(df[c("mean_age", "first", "last", "predictor", "diff")], upper.panel=NULL)
#corrplot::corrplot(cor(df[c("mean_age", "first", "last", "predictor", "diff")], use="complete.obs"))

Learning curve: Model selection

# Penalized-spline GAMM with a random intercept per id; serves as the
# flexible reference shape for the model comparison below
smoothing_spline = gamm(value ~ s(predictor, bs="ps"), random=list(id=~1), data=data)
summary(smoothing_spline$lme)
## Linear mixed-effects model fit by maximum likelihood
##  Data: strip.offset(mf) 
##        AIC      BIC    logLik
##   114666.4 114703.4 -57328.19
## 
## Random effects:
##  Formula: ~Xr - 1 | g
##  Structure: pdIdnot
##               Xr1       Xr2       Xr3       Xr4       Xr5       Xr6       Xr7
## StdDev: 0.8099883 0.8099883 0.8099883 0.8099883 0.8099883 0.8099883 0.8099883
##               Xr8
## StdDev: 0.8099883
## 
##  Formula: ~1 | id %in% g
##         (Intercept) Residual
## StdDev:    42.36131 25.40256
## 
## Fixed effects: y ~ X - 1 
##                      Value Std.Error    DF  t-value p-value
## X(Intercept)     211.23964  3.923285 12134 53.84255  0.0000
## Xs(predictor)Fx1  15.04985  5.293679 12134  2.84298  0.0045
##  Correlation: 
##                  X(Int)
## Xs(predictor)Fx1 -0.015
## 
## Standardized Within-Group Residuals:
##          Min           Q1          Med           Q3          Max 
## -11.44901916  -0.27753842   0.09347125   0.42997046   6.74370835 
## 
## Number of Observations: 12253
## Number of Groups: 
##         g id %in% g 
##         1       118
summary(smoothing_spline$gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## value ~ s(predictor, bs = "ps")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  211.240      3.923   53.84   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df     F  p-value    
## s(predictor) 4.081  4.081 5.771 0.000107 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  -0.00143   
##   Scale est. = 645.29    n = 12253
# Linear mixed model (random intercept per id) and its fixed-effect line
REM_linear = lmer(value ~ (1|id) + predictor, data)
equ_linear = function(t) fixef(REM_linear)[1] + fixef(REM_linear)[2]*t
summary(REM_linear)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor
##    Data: data
## 
## REML criterion at convergence: 114675.3
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -11.4469  -0.2753   0.0913   0.4294   6.7296 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 1809.0   42.53   
##  Residual              646.7   25.43   
## Number of obs: 12253, groups:  id, 118
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept) 211.34434    3.94004  53.640
## predictor    -0.00522    0.01205  -0.433
## 
## Correlation of Fixed Effects:
##           (Intr)
## predictor -0.050
# Quadratic mixed model (random intercept per id) and its fixed-effect curve
REM_quadratic = lmer(value ~ (1|id) + predictor + I(predictor^2), data)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
equ_quadratic = function(t) fixef(REM_quadratic)[1] + fixef(REM_quadratic)[2]*t + fixef(REM_quadratic)[3]*t^2
summary(REM_quadratic)
## Linear mixed model fit by REML ['lmerMod']
## Formula: value ~ (1 | id) + predictor + I(predictor^2)
##    Data: data
## 
## REML criterion at convergence: 114678.8
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -11.4567  -0.2791   0.0909   0.4277   6.7788 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  id       (Intercept) 1808.8   42.53   
##  Residual              646.2   25.42   
## Number of obs: 12253, groups:  id, 118
## 
## Fixed effects:
##                  Estimate Std. Error t value
## (Intercept)     2.106e+02  3.947e+00  53.356
## predictor       8.227e-02  2.912e-02   2.826
## I(predictor^2) -9.705e-04  2.940e-04  -3.301
## 
## Correlation of Fixed Effects:
##             (Intr) prdctr
## predictor   -0.074       
## I(prdctr^2)  0.059 -0.910
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
# Nonlinear mixed bounded-growth model via SSasymp,
# y(t) = yf + (y0 - yf) * exp(-exp(log_alpha) * t),
# with random baseline (y0) and asymptote (yf) per id
REM_bounded = nlmer(value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0|id) + (yf|id), data = data, start=c(yf=40, y0=20, log_alpha=-1))
# Fixed-effect estimates used by the helper equations further down
y0=fixef(REM_bounded)["y0"]
yf=fixef(REM_bounded)["yf"]
log_alpha=fixef(REM_bounded)["log_alpha"]
equ_bounded = function(t, yf=fixef(REM_bounded)[["yf"]], y0=fixef(REM_bounded)[["y0"]], log_alpha=fixef(REM_bounded)[["log_alpha"]]) yf+(y0-yf)*exp(-exp(log_alpha)*t)
summary(REM_bounded)
## Nonlinear mixed model fit by maximum likelihood  ['nlmerMod']
## Formula: value ~ SSasymp(predictor, yf, y0, log_alpha) ~ (y0 | id) + (yf |  
##     id)
##    Data: data
## 
##      AIC      BIC   logLik deviance df.resid 
## 113429.1 113473.6 -56708.6 113417.1    12247 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -12.3066  -0.2840   0.0746   0.4078   5.3309 
## 
## Random effects:
##  Groups   Name Variance Std.Dev.
##  id       y0   2050.3   45.28   
##  id.1     yf   2213.5   47.05   
##  Residual       567.1   23.81   
## Number of obs: 12253, groups:  id, 118
## 
## Fixed effects:
##            Estimate Std. Error t value
## yf        209.80135    0.08650 2425.48
## y0        212.21458    0.04741 4476.24
## log_alpha  -1.86878    0.07759  -24.09
## 
## Correlation of Fixed Effects:
##           yf     y0    
## y0        -0.008       
## log_alpha  0.001  0.000
# Rate constant alpha on the natural scale, and average asymptotic change
exp(log_alpha)
## log_alpha 
## 0.1543115
cat("Average improvement over baseline: ", (yf-y0)/y0*100)
## Average improvement over baseline:  -1.137165
# Goodness-of-fit comparison of the four candidate models
RMSE = sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) sqrt(mean(resid(mod)^2))) # RMSE
RMSE
## [1] 25.30796 25.29662 25.27767 23.59285
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) mean(abs(resid(mod)))) # MAE
## [1] 15.62753 15.61962 15.58534 14.45781
sapply(list(REM_linear, REM_quadratic, smoothing_spline$lme, REM_bounded), function(mod) extractAIC(mod)) # edf & AIC: smoothing_spline$lme always has edf 5
##          [,1]     [,2]     [,3]     [,4]
## [1,]      4.0      5.0      5.0      6.0
## [2,] 114680.8 114671.9 114666.4 113429.1
edf = sapply(list(REM_linear, REM_quadratic, smoothing_spline$gam, REM_bounded), function(mod) { nrow(data)-df.residual(mod) }) # while smoothing_spline$gam often has much higher edf
edf
## [1] 4.000000 5.000000 5.080598 6.000000
# Rounded values for the plot legend
RMSE = round(RMSE,1)
edf = round(edf,1)

Plot

# Follow-up length per id (in predictor units) and the number of ids still
# contributing data at each 10-unit bin
participation_duration = data %>% group_by(id) %>% summarise(weeks=last(predictor), .groups="keep") %>% deframe()
remaining_participants_bins = seq(0,350,by=10)
remaining_participants = data.frame(x=remaining_participants_bins, text=sapply(remaining_participants_bins, function(x) sum(participation_duration>=x)))

# x-axis cutoff: first bin with fewer than 10 remaining participants
xmax = remaining_participants$x[which(remaining_participants$text<10)[1]]

# y-range covering the central 90% of observed values (declutters the plot)
range_90p =quantile(data$value, probs=c(0.05,0.95))

# Model-comparison plot: raw trajectories with the spline, linear, quadratic
# and bounded-growth fits overlaid, and an RMSE/edf legend box
p1 = ggplot() +
  geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.1) +
  geom_line(aes(x,y), data.frame(x=0:xmax, y=predict(smoothing_spline$gam, newdata=data.frame(predictor=0:xmax))), linetype="longdash", size=1) + xlim(0,xmax) + ylim(range_90p[1], range_90p[2]) +
  stat_function(fun=equ_linear, color="blue", linetype="dotted", size=1) + 
  stat_function(fun=equ_quadratic, color="green4", linetype="dashed", size=1) + 
  stat_function(fun=equ_bounded, color=linecolor, size=1) + 
  theme_pubr(base_family="Serif") + no_x + xlab(NULL) + ylab(params$unit) +
  geom_richtext(aes(x,y,label=label,hjust=1), data.frame(x=0.8*xmax, y=range_90p[2]-(range_90p[2]-range_90p[1])/1.3, label=paste0("Model RMSE (edf):<br><span style='color:#0000ff'>····· Linear: ", RMSE[1], " (", edf[1], ") </span><br><span style='color:#008b00'>- - - Quadratic: ", RMSE[2], " (", edf[2], ") </span><br><span style='color:#000000'>— — Smoothing spline: ", RMSE[3], " (", edf[3], ") </span><br><span style='color:", linecolor, "'>— Bounded growth: ", RMSE[4], " (", edf[4], ") </span>")), family="Serif")

# Strip of remaining-participant counts, aligned under the learning-curve plot
p2 = ggplot(remaining_participants[remaining_participants$x<=xmax,]) +
  geom_text(aes(x=x,y="A",label=text), family="Serif") +
  theme_pubr(base_family="Serif") + no_y + xlab(params$xlab) + ylab(paste0("Remaining \n ", params$unit_n, "s")) +
  scale_x_continuous(breaks=seq(0,xmax,by=10))

# Stack the main plot over the counts strip (90% / 10% of the height)
(p1 / p2) + plot_layout(heights=c(0.9,0.1))
## Warning: Removed 835 row(s) containing missing values (geom_path).

# Profile-likelihood 95% CIs for the bounded-growth fixed effects; skipped here
# since bounded.growth.confidence.interval = F in this section's params
if (params$bounded.growth.confidence.interval) conf = confint.merMod(REM_bounded, c("y0","yf","log_alpha"), method="profile")
if (params$bounded.growth.confidence.interval) {
  print(conf)
  print(exp(conf[3,])) # rate constant alpha on the natural scale
  # NOTE(review): the CI below indexes conf rows 1/2 as if they were yf/y0, but
  # the confint call above requests c("y0","yf","log_alpha") — verify the row
  # order of the returned matrix matches this usage.
  print(paste0("Average boundary improvement over baseline: ", round((yf-y0)/y0*100, 1), " (", round((conf[1,1]-conf[2,1])/conf[2,1]*100, 1), "-", round((conf[1,2]-conf[2,2])/conf[2,2]*100, 1), ")"))
}

Bounded growth model

# Slope of the bounded-growth curve at time t:
# d/dt [yf + (y0-yf)*exp(-alpha*t)] = alpha*(yf-y0)*exp(-alpha*t),
# where alpha = exp(log_alpha); yf, y0, log_alpha are read from the
# surrounding environment (the fitted model's fixed effects).
equ_diff_REM_bounded = function(t) {
  alpha = exp(log_alpha)
  alpha * (yf - y0) * exp(-alpha * t)
}
# Inverse of the slope function: the time at which the curve's slope
# equals target_slope.
equ_diff_get_time_REM_bounded = function(target_slope) {
  alpha = exp(log_alpha)
  log(alpha * (yf - y0) / target_slope) / alpha
}

# Inverse of the bounded-growth curve: the time at which it reaches
# target_value.
equ_bounded_get_x = function(target_value) {
  log((yf - y0) / (yf - target_value)) / exp(log_alpha)
}

# Annotation table for the bounded growth figure: for each chosen percentile
# of the total practice gain, the repetition number, curve value, slope, and
# a text label; plus one final row marking the asymptote ("boundary").
growth_percentiles = c(0, 0.5, 0.9)
names_percentiles = c("baseline", "half-practice point", "90% practice")
selected_timepoints = equ_bounded_get_x(y0 + (yf - y0) * growth_percentiles)
timepoint_values = equ_bounded(selected_timepoints)
timepoint_slopes = equ_diff_REM_bounded(selected_timepoints)
example_slopes_bounded = data.frame(
  x = selected_timepoints,
  y = timepoint_values,
  label = paste0("y=", round(timepoint_values, 1), ", m=", signif(timepoint_slopes, 2), " at ", params$unit_time, " ", round(selected_timepoints, 0), ", ", names_percentiles),
  vjust = 1.5
)
# Extra row annotating the boundary (asymptote) of the fitted curve
example_slopes_bounded = rbind(example_slopes_bounded, list(x = 0.83 * xmax, y = yf, label = paste0("boundary: ", round(yf, 1)), vjust = -1.0))

if (params$bounded.growth.confidence.interval) ribbon = data.frame(x=seq(0,xmax,0.05), ymin=equ_bounded(seq(0,xmax,0.05), conf["yf","2.5 %"], conf["y0","2.5 %"], conf["log_alpha","2.5 %"]), ymax=equ_bounded(seq(0,xmax,0.05), conf["yf","97.5 %"], conf["y0","97.5 %"], conf["log_alpha","97.5 %"]))

# Report sample sizes: total tests, number of participants, and the median
# (with IQR) of tests contributed per participant (data$id).
quant = quantile(table(data$id))
print(paste0("n tests = ", nrow(data), " (n ", params$unit_n, "s = ", length(unique(data$id)), ", median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 12253 (n patients = 118, median tests per patient: 61.5, IQR 23.5-108.75)"
# Main bounded-growth figure: individual trajectories (grey lines) with the
# fitted curve, the example-slope annotations, and the boundary label.
# NOTE(review): aes_string() is deprecated in current ggplot2; kept here for
# consistency with the rest of the file.
p1 = ggplot() + geom_line(aes_string("predictor", "value", group="id"), data, color="darkgrey", alpha=0.2) +
  theme_pubr(base_family="Serif") + scale_x_continuous(limits = c(0,xmax), expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + xlab(params$xlab) + ylab(params$unit) +
  # dashed vertical line at the half-practice point (2nd annotation row)
  geom_vline(xintercept=example_slopes_bounded[2,"x"], color=linecolor, linetype=2) +
  stat_function(fun=equ_bounded, color=linecolor, size=1) +
  # points for every annotation row except the final "boundary" row
  geom_point(data=example_slopes_bounded[1:(nrow(example_slopes_bounded)-1),], aes(x,y), color=linecolor, size=5) +
  geom_text(data=example_slopes_bounded, aes(x,y,label=label, vjust=vjust), color=linecolor, hjust=-0.01, family="Serif")

# Overlay the confidence band only when it was computed above
if (params$bounded.growth.confidence.interval) p1 = p1 + geom_ribbon(aes(x=x, ymin=ymin, ymax=ymax), ribbon, fill=linecolor, alpha=0.3)

p1
## Warning: Removed 609 row(s) containing missing values (geom_path).

Quantile regression

# Censor follow-up for the quantile regression: use an explicit cutoff if
# one was supplied in params, otherwise default to the half-practice point
# of the bounded growth fit.
censor_after = if (is.null(params$censor_after)) {
  as.integer(round(selected_timepoints[2]))  # half-practice point
} else {
  params$censor_after
}
data_censored = data[data$predictor <= censor_after, ]

# Performance percentiles to model via quantile regression
percentiles = c(0.05,0.25,0.5,0.75,0.95)

# Fit all five conditional quantiles of value on repetition number in the
# censored data (third positional argument fills rq's `data`).
QR = rq(value ~ predictor, tau=percentiles, data_censored)

# Print the coefficient table (kernel-estimated standard errors) and a
# formatted intercept/slope summary with 95% Wald CIs for each quantile,
# and collect the two-sided p-value of the slope ("predictor") term.
# Improvements over the original: summary(QR, se="ker") is hoisted out of
# the loop (it was recomputed on every iteration), seq_along() replaces the
# 1:length() anti-pattern, and type-stable vapply() replaces sapply().
QR_summaries = summary(QR, se="ker")
p_vals = vapply(seq_along(QR_summaries), function(i) {
  summ = coef(QR_summaries[[i]])
  print(summ)
  print(paste0("Intercept: ", round(summ[1,1],1), " (", round(summ[1,1]-1.96*summ[1,2],1), "-", round(summ[1,1]+1.96*summ[1,2],1), "), beta: ", round(summ[2,1],2), " (", round(summ[2,1]-1.96*summ[2,2],2), "-", round(summ[2,1]+1.96*summ[2,2],2), ")"))
  summ[2,4]  # Pr(>|t|) of the slope
}, numeric(1))
##                  Value Std. Error   t value   Pr(>|t|)
## (Intercept) 116.150965  3.7063739 31.338167 0.00000000
## predictor     1.380726  0.7039671  1.961349 0.04990248
## [1] "Intercept: 116.2 (108.9-123.4), beta: 1.38 (0.00-2.76)"
##                   Value Std. Error   t value  Pr(>|t|)
## (Intercept) 199.0430454  2.1436351 92.853044 0.0000000
## predictor     0.5198164  0.3339589  1.556528 0.1196559
## [1] "Intercept: 199.0 (194.8-203.2), beta: 0.52 (-0.13-1.17)"
##                   Value Std. Error     t value  Pr(>|t|)
## (Intercept) 228.2385159  1.1758108 194.1115934 0.0000000
## predictor    -0.1166321  0.1886001  -0.6184094 0.5363381
## [1] "Intercept: 228.2 (225.9-230.5), beta: -0.12 (-0.49-0.25)"
##                   Value Std. Error    t value    Pr(>|t|)
## (Intercept) 248.2734655  1.1073957 224.195809 0.000000000
## predictor    -0.4933655  0.1810535  -2.724971 0.006456638
## [1] "Intercept: 248.3 (246.1-250.4), beta: -0.49 (-0.85--0.14)"
##                    Value Std. Error      t value Pr(>|t|)
## (Intercept) 2.720000e+02  1.6816212 1.617487e+02        0
## predictor   3.363484e-14  0.2695825 1.247664e-13        1
## [1] "Intercept: 272.0 (268.7-275.3), beta: 0.00 (-0.53-0.53)"
# Bonferroni-correct the slope p-values across the five quantiles
p_vals = p.adjust(p_vals, method="bonferroni")

# Joint test of equality of slopes across quantiles
ANOVA = anova(QR)

# Report sample sizes in the censored data set
quant = quantile(table(data_censored$id))
print(paste0("n tests = ", nrow(data_censored), " (median tests per ", params$unit_n, ": ", quant["50%"], ", IQR ", quant["25%"], "-", quant["75%"], ")"))
## [1] "n tests = 4317 (median tests per patient: 35.5, IQR 15-57.75)"
# Format a p-value rounded to `digits` significant digits for plot labels.
# A value that rounds to exactly zero is reported as "< 2e-16" (the
# conventional smallest p-value printed by R); anything else as "= <x>".
signif_p = function(x, digits=1) {
  rounded = signif(x, digits)
  if (rounded == 0) "< 2e-16" else paste0("= ", rounded)
}

# Quantile-regression figure: individual trajectories (grey) with one fitted
# regression line per percentile, slope annotations with Bonferroni-adjusted
# p-values, the ANOVA p-value, and the censoring cutoff as a dashed line.
ggplot() + geom_line(aes_string("predictor", "value", group="id"), data_censored, alpha=0.2, color="darkgrey") + theme_pubr(base_family="Serif") + scale_x_continuous(expand = expansion(mult = c(0, 0))) + scale_y_continuous(expand = expansion(mult = c(0, 0))) + theme(legend.position = "none") + xlab(params$xlab) + ylab(params$unit) +
  # one line per tau: coef(QR) has intercepts in row 1 and slopes in row 2
  geom_abline(intercept=coef(QR)[1,], slope=coef(QR)[2,], color=linecolor) +
  # label each line at its intercept with slope and adjusted p-value
  geom_text(data=data.frame(intercept=coef(QR)[1,], label=paste0(percentiles*100, "th percentile: β = ", round(coef(QR)[2,],1), ", ", "p.adj ", sapply(p_vals,signif_p))),
            mapping=aes(x=1,y=intercept, label=label), color=linecolor, hjust="left", vjust=1, family="Serif") +
  coord_cartesian(xlim=c(0,censor_after)) +
  geom_text(aes(x=x,y=y,label=label), data.frame(x=0.8*censor_after, y=0, label=paste0("ANOVA p ", signif_p(ANOVA$table$pvalue, 1))), vjust=-1.5, family="Serif") +
  geom_vline(xintercept=censor_after, color=linecolor, linetype=2)